/* * XMLConverter.java * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.greenstone.gsdl3.util; // XML classes import org.w3c.dom.DOMImplementation; import org.w3c.dom.Document; import org.w3c.dom.DocumentType; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.NamedNodeMap; import org.xml.sax.InputSource; import org.xml.sax.EntityResolver; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXParseException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXNotSupportedException; import org.apache.xerces.parsers.DOMParser; import org.apache.xerces.dom.DocumentImpl; // for new Documents import org.apache.xerces.dom.DocumentTypeImpl; // other java classes import java.io.BufferedWriter; import java.io.ByteArrayInputStream; import java.io.FileWriter; import java.io.InputStream; import java.io.Reader; import java.io.InputStreamReader; import java.io.StringReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.util.regex.*; import org.apache.log4j.*; // Apache Commons import org.apache.commons.lang3.*; import java.util.*; import java.lang.reflect.*; /** * XMLConverter - utility class for greenstone * * generates new Documents * parses XML Strings into Documents, converts Nodes to Strings * different parsers have different behaviour - can experiment in here * at the moment we only use xerces * all xerces specific code is in here */ public class XMLConverter { static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName()); /** the no-args constructor */ public XMLConverter() { } /** returns a DOM Document */ public static Document getDOM(String in) { try { Reader reader = new StringReader(in); InputSource xml_source = new InputSource(reader); Document doc = getDOM(xml_source, null); reader.close(); return doc; } catch (Exception e) { logger.error(e.getMessage()); logger.error("Input string was:\n" + in); e.printStackTrace(); } return null; } /** returns a DOM Document */ public static Document getDOM(String in, String encoding) { try { InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding); InputSource xml_source = new InputSource(reader); Document doc = getDOM(xml_source, null); reader.close(); return doc; } catch (Exception e) { logger.error(e.getMessage()); logger.error("Input string was:\n" + in); e.printStackTrace(); } return null; } /** returns a DOM Document */ public static Document getDOM(File in) { try { FileReader reader = new FileReader(in); InputSource xml_source = new InputSource(reader); Document doc = getDOM(xml_source, null); reader.close(); return doc; } catch (Exception e) { logger.error(e.getMessage(), e); logger.error("File was:\n" + in.getPath()); e.printStackTrace(); } return null; } public static Document getDOM(File in, String encoding) { return getDOM(in, encoding, null); } /** returns a DOM document */ public static Document getDOM(File in, String encoding, EntityResolver er) { try { InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding); InputSource xml_source = new InputSource(isr); Document doc = getDOM(xml_source, er); isr.close(); return doc; } catch (Exception e) { logger.error(e.getMessage()); logger.error("File was:\n" + in.getPath()); e.printStackTrace(); } return null; } public static Document getDOM(File in, EntityResolver er) { try { InputSource xml_source = new InputSource(new FileInputStream(in)); Document doc = getDOM(xml_source, er); return doc; } catch (Exception e) { logger.error(e.getMessage()); logger.error("File was:\n" + in.getPath()); e.printStackTrace(); } return null; } public static Document getDOM(InputSource source, EntityResolver er) { try { DOMParser parser = new DOMParser(); parser.setFeature("http://xml.org/sax/features/validation", false); // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up. parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); // a performance test showed that having this on lead to increased // memory use for small-medium docs, and not much gain for large // docs. // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false); // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing parser.setErrorHandler(new ParseErrorHandler()); if (er != null) { parser.setEntityResolver(er); } parser.parse(source); Document doc = parser.getDocument(); return doc; } catch (Exception e) { logger.error(e.getMessage()); logger.error("InputSource:\n"); e.printStackTrace(); } return null; } /** creates a new empty DOM Document */ public static Document newDOM() { Document doc = new DocumentImpl(); return doc; } /** * This method's parameters represent the parts of the Doctype of this * Document that is to be created. For more info see * http://xerces.apache.org * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl * .html#DocumentTypeImpl * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String) * * */ public static Document newDOM(String qualifiedName, String publicID, String systemID) { // create empty DOM document DocumentImpl docImpl = new DocumentImpl(); // Need to use the document to create the docType for it DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID); // Although we have created the docType using the document, we need to still // put it into the empty document we just created try { docImpl.appendChild(myDocType); } catch (Exception e) { System.out.println("Could not append docType because: " + e); } // return the document containing a DocType return docImpl; } /** returns the Node as a String */ public static String getString(Node xmlNode) { StringBuffer xmlRepresentation = new StringBuffer(); getString(xmlNode, xmlRepresentation, 0, false); return xmlRepresentation.toString(); } /** * returns the node as a nicely formatted String - this introduces extra * text nodes if the String is read back in as a DOM, so should only be used * for printing */ public static String getPrettyString(Node xmlNode) { StringBuffer xmlRepresentation = new StringBuffer(); getString(xmlNode, xmlRepresentation, 0, true); return xmlRepresentation.toString(); } /* * For the purposes of logger.debug statements, where this is called and * hence outputted, returns an empty string if debugging is not enabled */ public static String getPrettyStringLogger(Node xmlNode, Logger log) { if (log.isDebugEnabled()) return getPrettyString(xmlNode); return ""; } private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty) { if (xmlNode == null) { xmlRepresentation.append(""); return; } short nodeType = xmlNode.getNodeType(); String nodeName = xmlNode.getNodeName(); if (nodeType == Node.DOCUMENT_NODE) { Document xmlDocNode = (Document) xmlNode; //if (xmlDocNode.getDoctype() == null) { //System.err.println("Doctype is null."); //} //else { if (xmlDocNode.getDoctype() != null) { DocumentType dt = xmlDocNode.getDoctype(); String name = dt.getName(); String pid = dt.getPublicId(); String sid = dt.getSystemId(); // Use previously assigned name, not dt.getName() again String doctype_str = "\n"; xmlRepresentation.append(doctype_str); } getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty); return; } // Handle Element nodes if (nodeType == Node.ELEMENT_NODE) { if (pretty) { xmlRepresentation.append("\n"); for (int i = 0; i < depth; i++) { xmlRepresentation.append(" "); } } // Write opening tag xmlRepresentation.append("<"); xmlRepresentation.append(nodeName); // Write the node attributes NamedNodeMap nodeAttributes = xmlNode.getAttributes(); for (int i = 0; i < nodeAttributes.getLength(); i++) { Node attribute = nodeAttributes.item(i); xmlRepresentation.append(" "); xmlRepresentation.append(attribute.getNodeName()); xmlRepresentation.append("=\""); String attr_val = attribute.getNodeValue(); attr_val = attr_val.replaceAll("&","&"); attr_val = attr_val.replaceAll("<","<"); attr_val = attr_val.replaceAll(">",">"); attr_val = attr_val.replaceAll("\"","""); // assume that any of the above chars that was already entity escaped // was already correct => return back to how they were attr_val = attr_val.replaceAll("&amp;","&"); attr_val = attr_val.replaceAll("&lt;","<"); attr_val = attr_val.replaceAll("&gt;",">"); attr_val = attr_val.replaceAll("&quot;","""); xmlRepresentation.append(attr_val); xmlRepresentation.append("\""); } // If the node has no children, close the opening tag and return if (xmlNode.hasChildNodes() == false) { // This produces somewhat ugly output, but it is necessary to compensate // for display bugs in Netscape. Firstly, the space is needed before the // closing bracket otherwise Netscape will ignore some tags (
, for // example). Also, a newline character would be expected after the tag, // but this causes problems with the display of links (the link text // will contain a newline character, which is displayed badly). xmlRepresentation.append(" />"); return; } // Close the opening tag xmlRepresentation.append(">"); // Process the children. We process text nodes here, but recursively process other nodes. // hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting // Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented. // also if these are inside a pre tag then the space shows up in the page. NodeList children = xmlNode.getChildNodes(); boolean do_pretty = pretty; boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); short child_type = child.getNodeType(); if (child_type == Node.PROCESSING_INSTRUCTION_NODE) { if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) { output_escaping = false; } else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) { output_escaping = true; } else { logger.warn("Unhandled processing instruction " + child.getNodeName()); } } else if (child_type == Node.TEXT_NODE) { do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else // output the text String text = child.getNodeValue(); // Perform output escaping, if required // Apache Commons replace method is far superior to String.replaceAll - very fast! if (output_escaping) { text = StringUtils.replace(text, "&", "&"); text = StringUtils.replace(text, "<", "<"); text = StringUtils.replace(text, ">", ">"); text = StringUtils.replace(text, "'", "'"); text = StringUtils.replace(text, "\"", """); } // Remove any control-C characters text = StringUtils.replace(text, "" + (char) 3, ""); xmlRepresentation.append(text); } else { // recursively call getString getString(child, xmlRepresentation, depth + 1, do_pretty); } } // foreach child of the element // Write closing tag if (pretty) { if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n') { for (int i = 0; i < depth; i++) xmlRepresentation.append(" "); } } xmlRepresentation.append(""); if (pretty) { xmlRepresentation.append("\n"); } } // ELEMENT_NODE else if (nodeType == Node.COMMENT_NODE) { String text = xmlNode.getNodeValue(); xmlRepresentation.append(""); } // TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node // A type of node that is not handled yet else { logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType)); } return; } protected static String getNodeTypeString(short node_type) { String type = ""; switch (node_type) { case Node.ATTRIBUTE_NODE: type = "ATTRIBUTE_NODE"; break; case Node.CDATA_SECTION_NODE: type = "CDATA_SECTION_NODE"; break; case Node.COMMENT_NODE: type = "COMMENT_NODE"; break; case Node.DOCUMENT_FRAGMENT_NODE: type = "DOCUMENT_FRAGMENT_NODE"; break; case Node.DOCUMENT_NODE: type = "DOCUMENT_NODE"; break; case Node.DOCUMENT_TYPE_NODE: type = "DOCUMENT_TYPE_NODE"; break; case Node.ELEMENT_NODE: type = "ELEMENT_NODE"; break; case Node.ENTITY_NODE: type = "ENTITY_NODE"; break; case Node.ENTITY_REFERENCE_NODE: type = "ENTITY_REFERENCE_NODE"; break; case Node.NOTATION_NODE: type = "NOTATION_NODE"; break; case Node.PROCESSING_INSTRUCTION_NODE: type = "PROCESSING_INSTRUCTION_NODE"; break; case Node.TEXT_NODE: type = "TEXT_NODE"; break; default: type = "UNKNOWN"; } return type; } // returns null if there no error occurred during parsing, or else returns the error message // public String getParseErrorMessage() // { // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler(); // return errorHandler.getErrorMessage(); // } // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page. static public class ParseErrorHandler implements ErrorHandler { protected String errorMessage = null; // Receive notification of a recoverable error. public void error(SAXParseException exception) { handleError("Error:\n", exception); } // Receive notification of a non-recoverable error. public void fatalError(SAXParseException exception) { handleError("Fatal Error:\n", exception); } // Receive notification of a warning. public void warning(SAXParseException exception) { handleError("Warning:\n", exception); } public String toString(SAXParseException e) { String msg = e.getMessage(); msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")"; msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID"; msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID"; return msg; } // clears the errorPage variable after first call to this method public String getErrorMessage() { String errMsg = this.errorMessage; if (this.errorMessage != null) { this.errorMessage = null; } return errMsg; } // sets the errorMessage member variable to the data stored in the exception // and writes the errorMessage to the logger and tomcat's System.err protected void handleError(String errorType, SAXParseException exception) { this.errorMessage = errorType + toString(exception); System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n"); logger.error(this.errorMessage); } } public static boolean writeDOM(Element elem, File file) { BufferedWriter writer = null; boolean success = false; try { String xml_string = getString(elem); // need createNewFile??? writer = new BufferedWriter(new FileWriter(file)); writer.write(xml_string); success = true; } catch (Exception e) { logger.error(e.getMessage()); success = false; } finally { try { if (writer != null) { writer.close(); } } catch(Exception e) { logger.error("couldn't close the file"+e.getMessage()); } } return success; } }