/* * XMLConverter.java * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.greenstone.gsdl3.util; // XML classes import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.NamedNodeMap; import org.xml.sax.InputSource; import org.xml.sax.EntityResolver; import org.apache.xerces.parsers.DOMParser; import org.apache.xerces.dom.DocumentImpl; // for new Documents // other java classes import java.io.Reader; import java.io.InputStreamReader; import java.io.StringReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; /** XMLConverter - utility class for greenstone * * parses XML Strings into Documents, converts Nodes to Strings * different parsers have different behaviour - can experiment in here * now we only use xerces * * @author Katherine Don * @version $Revision: 9844 $ * */ public class XMLConverter { /** xerces parser */ protected DOMParser parser = null; private boolean outputEscaping = true; /** the no-args constructor */ public XMLConverter() { try { this.parser = new DOMParser(); this.parser.setFeature("http://xml.org/sax/features/validation", false); } catch (Exception e) { System.err.println("XMLConverter:exception "+e.getMessage()); } } /** sets the entity resolver. 
pass in null to unset it */ public void setEntityResolver(EntityResolver er) { this.parser.setEntityResolver(er); } /** returns a DOM Document */ public Document getDOM(String in) { try { Reader reader = new StringReader(in); InputSource xml_source = new InputSource(reader); this.parser.parse(xml_source); Document doc = this.parser.getDocument(); return doc; } catch (Exception e) { System.err.println("XMLConverter.getDOM(String): exception "+e.getMessage()); } return null; } /** returns a DOM Document */ public Document getDOM(File in) { try { FileReader reader = new FileReader(in); InputSource xml_source = new InputSource(reader); this.parser.parse(xml_source); Document doc = this.parser.getDocument(); return doc; } catch (Exception e) { System.err.println("XMLConverter.getDOM(File): exception "+e.getMessage()); e.printStackTrace(); } return null; } /** returns a DOM document */ public Document getDOM(File in, String encoding) { try { InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding); InputSource xml_source = new InputSource(isr); this.parser.parse(xml_source); Document doc = this.parser.getDocument(); return doc; } catch (Exception e) { System.err.println("XMLConverter.getDOM(File): exception "+e.getMessage()); } return null; } /** creates a new empty DOM Document */ public Document newDOM() { Document doc = new DocumentImpl(); return doc; } /** returns the Node as a String */ public String getString(Node xmlNode) { outputEscaping = true; return getString(xmlNode, 0, false); } /** returns the node as a nicely formatted String - this introduces extra * text nodes if the String is read back in as a DOM, so should only be * used for printing */ public String getPrettyString(Node xmlNode) { outputEscaping = true; return getString(xmlNode, 0, true); } private String getString(Node xmlNode, int depth, boolean pretty) { String xmlRepresentation = ""; if (xmlNode == null) return ""; short nodeType = xmlNode.getNodeType(); String nodeName = 
xmlNode.getNodeName(); if (nodeType == Node.DOCUMENT_NODE) { return getString(((Document)xmlNode).getDocumentElement(), depth, pretty); } // Handle Element nodes if (nodeType == Node.ELEMENT_NODE) { if (pretty) { xmlRepresentation += "\n"; for (int i = 0; i < depth; i++) { xmlRepresentation += " "; } } // Write opening tag xmlRepresentation += "<" + nodeName; // Write the node attributes NamedNodeMap nodeAttributes = xmlNode.getAttributes(); for (int i = 0; i < nodeAttributes.getLength(); i++) { Node attribute = nodeAttributes.item(i); xmlRepresentation += " " + attribute.getNodeName() + "="; xmlRepresentation += "\"" + attribute.getNodeValue() + "\""; } // If the node has no children, close the opening tag and return if (xmlNode.hasChildNodes() == false) { // This produces somewhat ugly output, but it is necessary to compensate // for display bugs in Netscape. Firstly, the space is needed before the // closing bracket otherwise Netscape will ignore some tags (
, for // example). Also, a newline character would be expected after the tag, // but this causes problems with the display of links (the link text // will contain a newline character, which is displayed badly). xmlRepresentation += " />"; return xmlRepresentation; } // Close the opening tag xmlRepresentation += ">"; // Apply recursively to the children of this node // hack for nodes next to text nodes - dont make them pretty // this is needed for text inside a

 element - any new lines 
	    // or spaces around the span elements show up in the text
	    NodeList children = xmlNode.getChildNodes();
	    boolean do_pretty = pretty;
	    for (int i = 0; i < children.getLength(); i++) {
		if (children.item(i).getNodeType()==Node.TEXT_NODE) {
		    do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
		}
		xmlRepresentation += getString(children.item(i), depth + 1, do_pretty);
	    }
	    
	    // Write closing tag
	    if (pretty) {
		if (xmlRepresentation.endsWith("\n")) {
		    for (int i = 0; i < depth; i++)
			xmlRepresentation += "  ";
		}
	    }
	    xmlRepresentation += "";
	    if (pretty) {
		xmlRepresentation += "\n";
	    }
	}
	
	// Handle Text nodes
	else if (nodeType == Node.TEXT_NODE) {
	    String text = xmlNode.getNodeValue();

	    // Perform output escaping, if required
	    if (outputEscaping) {
		text = text.replaceAll("&", "&");  // Must be done first!!
		text = text.replaceAll("<", "<");
		text = text.replaceAll(">", ">");
		text = text.replaceAll("\"", """);
		text = text.replaceAll("\'", "'");
	    }

	    // Remove any control-C characters
	    text = text.replaceAll("" + (char) 3, "");
	    xmlRepresentation += text;
	}

	// Handle Processing Instruction nodes
	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
	    if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
		outputEscaping = false;
	    }
	    else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
		outputEscaping = true;
	    }
	    else {
		System.err.println("XMLConverter.getString(): Warning: Unhandled processing instruction " + nodeName);
	    }
	}
	
	else if (nodeType == Node.COMMENT_NODE) {
	    String text = xmlNode.getNodeValue();
	    xmlRepresentation += "";
	}
	    
	    
	
	// A type of node that is not handled yet
	else {
	    System.err.println("XMLConverter.getString(): Warning: Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
	}

	return xmlRepresentation;
    }

    /** returns the name of the org.w3c.dom.Node type constant that matches
     * node_type, for use in diagnostic messages
     *
     * @param node_type a value as returned by Node.getNodeType()
     * @return the constant's name (for example "TEXT_NODE"), or "UNKNOWN"
     *         if node_type is not one of the Node type constants
     */
    protected static String getNodeTypeString(short node_type) {

        switch (node_type) {
        case Node.ELEMENT_NODE:
            return "ELEMENT_NODE";
        case Node.ATTRIBUTE_NODE:
            return "ATTRIBUTE_NODE";
        case Node.TEXT_NODE:
            return "TEXT_NODE";
        case Node.CDATA_SECTION_NODE:
            return "CDATA_SECTION_NODE";
        case Node.ENTITY_REFERENCE_NODE:
            return "ENTITY_REFERENCE_NODE";
        case Node.ENTITY_NODE:
            return "ENTITY_NODE";
        case Node.PROCESSING_INSTRUCTION_NODE:
            return "PROCESSING_INSTRUCTION_NODE";
        case Node.COMMENT_NODE:
            return "COMMENT_NODE";
        case Node.DOCUMENT_NODE:
            return "DOCUMENT_NODE";
        case Node.DOCUMENT_TYPE_NODE:
            return "DOCUMENT_TYPE_NODE";
        case Node.DOCUMENT_FRAGMENT_NODE:
            return "DOCUMENT_FRAGMENT_NODE";
        case Node.NOTATION_NODE:
            return "NOTATION_NODE";
        default:
            // anything outside the set of Node type constants
            return "UNKNOWN";
        }
    }
}