package org.greenstone.gatherer.util;

import java.io.*;
import java.net.*;
import java.util.*;
import org.apache.xerces.parsers.*;
import org.apache.xml.serialize.*;
import org.greenstone.gatherer.DebugStream;
import org.w3c.dom.*;
import org.xml.sax.*;

import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;

// SAX
import org.xml.sax.XMLReader;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.InputSource;

// JAXP
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * This class is a static class containing useful XML functions: small DOM
 * navigation helpers, well-formedness checks, DOM parsing and serialisation.
 *
 * NOTE(review): the copy of this file that was reviewed had lost every span
 * between a '<' and the following '>' (tag stripping). Methods marked
 * "reconstructed" below were rebuilt from their surviving fragments and
 * documentation and should be compared against the upstream original. Other
 * methods that once lived between getNamedElement and getValue could not be
 * recovered and are not present here.
 */
public class XMLTools {

    // ------------------------------------------------------------------
    // Simple DOM accessors
    // ------------------------------------------------------------------

    /**
     * Returns the first text node that is a direct child of the given element.
     * The element is normalised first, so adjacent text nodes are merged.
     *
     * @param param the element to inspect
     * @return the first child text node, or null if the element has none
     */
    public static Node getNodeTextNode(Element param) {
        param.normalize();
        Node n = param.getFirstChild();
        while (n != null && n.getNodeType() != Node.TEXT_NODE) {
            n = n.getNextSibling();
        }
        return n;
    }

    /**
     * Extracts the text out of a node.
     *
     * @param param the element to inspect
     * @return the value of the first child text node, or "" if there is none
     */
    public static String getNodeText(Element param) {
        Node text_node = getNodeTextNode(param);
        if (text_node == null) {
            return "";
        }
        return text_node.getNodeValue();
    }

    /**
     * Replaces the first child text node of an element with a new text node
     * holding the given text. The new node is appended as the last child.
     *
     * @param elem the element to modify
     * @param text the new text
     */
    public static void setNodeText(Element elem, String text) {
        Node old_text_node = getNodeTextNode(elem);
        if (old_text_node != null) {
            elem.removeChild(old_text_node);
        }
        Text t = elem.getOwnerDocument().createTextNode(text);
        elem.appendChild(t);
    }

    /**
     * Returns the (first) child node with the given name.
     *
     * @param n    the parent node
     * @param name the node name to look for
     * @return the first matching child, or null if not found
     */
    public static Node getChildByTagName(Node n, String name) {
        Node child = n.getFirstChild();
        while (child != null) {
            if (child.getNodeName().equals(name)) {
                return child;
            }
            child = child.getNextSibling();
        }
        return null; // not found
    }

    /**
     * Returns the (nth) child node with the given name. Index numbers start
     * at 0; an index of -1 behaves like {@link #getChildByTagName}.
     *
     * @param n     the parent node
     * @param name  the node name to look for
     * @param index zero-based position among the matching children, or -1
     * @return the requested child, or null if not found
     */
    public static Node getChildByTagNameIndexed(Node n, String name, int index) {
        if (index == -1) {
            return getChildByTagName(n, name);
        }
        int count = 0;
        Node child = n.getFirstChild();
        while (child != null) {
            if (child.getNodeName().equals(name)) {
                if (count == index) {
                    return child;
                }
                count++;
            }
            child = child.getNextSibling();
        }
        return null; // not found
    }

    /**
     * Returns the element parent/node_name[@attribute_name='attribute_value'].
     *
     * NOTE(review): the loop body was lost in the reviewed copy; it is
     * reconstructed here from the signature and the description above.
     *
     * @return the first matching child element, or null if there is none
     */
    public static Element getNamedElement(Element parent, String node_name,
                                          String attribute_name, String attribute_value) {
        NodeList children = parent.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE
                && child.getNodeName().equals(node_name)
                && ((Element) child).getAttribute(attribute_name).equals(attribute_value)) {
                return (Element) child;
            }
        }
        return null; // not found
    }

    /**
     * Remove all of the child nodes from a certain node.
     *
     * NOTE(review): reconstructed. setValue() calls clear(element), so a
     * method of this name must exist in this class, but its original text
     * (and visibility) was lost in the reviewed copy -- confirm upstream.
     *
     * @param node the node to empty
     */
    static final public void clear(Node node) {
        while (node.hasChildNodes()) {
            node.removeChild(node.getFirstChild());
        }
    }

    /**
     * Method to retrieve the value of a given node.
     *
     * @param element The Element whose value we wish to find.
     * @return the concatenated text of the node's direct text children, or ""
     * Soon to be deprecated!
     */
    static final public String getValue(Node element) {
        if (element == null) {
            return "";
        }
        // If we've been given a subject node first retrieve its value node.
        if (element.getNodeName().equals("Subject")) {
            element = getNodeFromNamed(element, "Value");
        }
        // If we've got a value node, then reconstruct the text. Remember that DOM
        // will split text over 256 characters into several text nodes
        if (element != null && element.hasChildNodes()) {
            StringBuffer text_buffer = new StringBuffer();
            NodeList text_nodes = element.getChildNodes();
            for (int i = 0; i < text_nodes.getLength(); i++) {
                Node possible_text = text_nodes.item(i);
                if (possible_text.getNodeName().equals(StaticStrings.TEXT_NODE)) {
                    text_buffer.append(possible_text.getNodeValue());
                }
            }
            return text_buffer.toString();
        }
        return "";
    }

    /**
     * Method to retrieve from the node given, a certain child node with the
     * specified name.
     *
     * @param parent The Node whose children should be searched.
     * @param name   The required nodes name as a String.
     * @return The requested Node if it is found, null otherwise.
     * Soon to be deprecated!
     */
    static final public Node getNodeFromNamed(Node parent, String name) {
        for (Node i = parent.getFirstChild(); i != null; i = i.getNextSibling()) {
            if (i.getNodeName().equals(name)) {
                return i;
            }
        }
        return null;
    }

    // ------------------------------------------------------------------
    // Well-formedness checks
    // ------------------------------------------------------------------

    static final public String WELLFORMED = "well-formed !";
    static final public String NOTWELLFORMED = "not well-formed";

    // NOTE(review): both constants are empty in the reviewed copy. They are
    // wrapped around the input of parse(String), so they presumably held an
    // XML declaration / root start tag and the matching end tag that were
    // stripped along with the other markup -- confirm against upstream.
    static final private String HEADER = "";
    static final private String FOOTER = "";

    /**
     * Checks whether a string is well-formed XML once wrapped between HEADER
     * and FOOTER.
     *
     * @param xml_str the XML text to check
     * @return {@link #WELLFORMED} on success, otherwise a message describing
     *         the failure (parse errors start with {@link #NOTWELLFORMED})
     */
    public static String parse(String xml_str) {
        String wrapped = HEADER + xml_str + FOOTER;
        return checkWellFormed(new InputSource(new StringReader(wrapped)), " Fatal error: ");
    }

    /**
     * Same check as {@link #parse(String)}, but the parsed string xml_str is
     * not wrapped by the header and footer strings.
     *
     * @param xml_str the complete XML document text to check
     * @return {@link #WELLFORMED} on success, otherwise a failure message
     */
    public static String parseDOM(String xml_str) {
        return checkWellFormed(new InputSource(new StringReader(xml_str)), " ");
    }

    /**
     * Checks whether the contents of a file are well-formed XML.
     *
     * The file is handed to the parser as a byte stream so that the parser
     * itself decides the encoding; the stream is always closed afterwards.
     *
     * @param xml_file the file to check
     * @return {@link #WELLFORMED} on success, otherwise a failure message
     */
    public static String parse(File xml_file) {
        InputStream in = null;
        try {
            in = new FileInputStream(xml_file);
            return checkWellFormed(new InputSource(in), " Fatal error: ");
        } catch (IOException e) {
            return "Unable to read the input, i/o error";
        } finally {
            closeAndLog(in);
        }
    }

    /**
     * Runs a namespace-aware, non-validating SAX parse over the source and
     * translates the outcome into one of the status strings.
     *
     * @param source           what to parse
     * @param sax_error_prefix text placed before the exception description
     *                         when a SAXException that is not a parse error
     *                         occurs
     * @return {@link #WELLFORMED} or a failure message
     */
    private static String checkWellFormed(InputSource source, String sax_error_prefix) {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setNamespaceAware(true);
            //factory.setValidating (true);
            SAXParser parser = factory.newSAXParser();
            XMLReader reader = parser.getXMLReader();
            reader.setContentHandler(new DefaultHandler());
            reader.setErrorHandler(new DefaultHandler());
            reader.parse(source);
            return WELLFORMED;
        } catch (FactoryConfigurationError e) {
            return "unable to get a document builder factory";
        } catch (ParserConfigurationException e) {
            return "unable to configure parser";
        } catch (SAXParseException e) {
            return NOTWELLFORMED + getLocationString(e) + e.getMessage();
        } catch (SAXException e) {
            // A failed parse must not be reported with the WELLFORMED prefix.
            return NOTWELLFORMED + sax_error_prefix + e.toString();
        } catch (IOException e) {
            return "Unable to read the input, i/o error";
        }
    }

    /**
     * Returns a string of the location, in the form
     * {@code [systemId](line L, column C): }.
     */
    private static String getLocationString(SAXParseException ex) {
        StringBuffer str = new StringBuffer();

        String systemId = ex.getSystemId();
        if (systemId != null) {
            // Only keep the last path component.
            int index = systemId.lastIndexOf('/');
            if (index != -1) {
                systemId = systemId.substring(index + 1);
            }
            str.append(systemId);
        }
        str.append("(line ");
        // NOTE(review): the reported line is one less than the parser's;
        // presumably compensating for wrapped header content -- confirm.
        str.append(ex.getLineNumber() - 1);
        str.append(", column ");
        str.append(ex.getColumnNumber());
        str.append("): ");

        return str.toString();
    } // getLocationString(SAXParseException):String

    // ------------------------------------------------------------------
    // DOM parsing
    // ------------------------------------------------------------------

    /**
     * Parse an XML document from a given file path.
     *
     * @param xml_file_path    path of the XML file
     * @param use_class_loader if true, first try to load "/" + path through
     *                         JarTools before falling back to the file system
     * @return the parsed document, or null on failure
     */
    static public Document parseXMLFile(String xml_file_path, boolean use_class_loader) {
        if (use_class_loader) {
            InputStream is = JarTools.getResourceAsStream("/" + xml_file_path);
            if (is != null) {
                return parseXML(is);
            }
        }

        // Try the file outside the classes directory
        return parseXMLFile(new File(xml_file_path));
    }

    /**
     * Parse an XML document from a given file.
     *
     * @param xml_file the file to parse
     * @return the parsed document, or null if the file does not exist or
     *         cannot be parsed
     */
    static public Document parseXMLFile(File xml_file) {
        // No file? No point trying!
        if (!xml_file.exists()) {
            return null;
        }

        try {
            // parseXML(InputStream) takes care of closing the stream.
            return parseXML(new FileInputStream(xml_file));
        } catch (Exception exception) {
            DebugStream.printStackTrace(exception);
            return null;
        }
    }

    /**
     * Parse an XML document from a given input stream, decoding it as UTF-8.
     * The stream is closed before returning, whether or not parsing worked.
     *
     * @param xml_input_stream the stream to read
     * @return the parsed document, or null on failure
     */
    static public Document parseXML(InputStream xml_input_stream) {
        Document document = null;
        InputStreamReader isr = null;
        try {
            isr = new InputStreamReader(xml_input_stream, "UTF-8");
            document = parseXML(isr);
        } catch (Exception exception) {
            DebugStream.printStackTrace(exception);
        } finally {
            closeAndLog(isr);
            closeAndLog(xml_input_stream);
        }
        return document;
    }

    /**
     * Parse an XML document from a given reader.
     *
     * When debugging is off the reader is wrapped in a
     * RemoveContentBeforeRootElementXMLReader. When debugging is on the whole
     * input is read into memory first (and the given reader closed) so that
     * the offending text can be shown if parsing fails; in that mode a parse
     * failure terminates the JVM via System.exit(-1).
     *
     * @param xml_reader the reader to parse from
     * @return the parsed document, or null on failure
     */
    static public Document parseXML(Reader xml_reader) {
        Document document = null;

        // If debugging, the following will store the XML contents to be parsed,
        // which can then be inspected upon encountering a SAXException
        // (need to run GLI with -debug on)
        String xmlContents = "";

        try {
            Reader reader;

            if (!DebugStream.isDebuggingEnabled()) {
                // (1) By default, GLI will remove any contents preceeding (and
                // invalidating) the XML and present these lines separately to the user
                try {
                    reader = new BufferedReader(new RemoveContentBeforeRootElementXMLReader(xml_reader));
                } catch (Exception e) {
                    System.err.println("Exception while wrapping the reader in parseXML(Reader)");
                    e.printStackTrace();
                    // Without a reader there is nothing to parse.
                    return null;
                }
            } else {
                // (2) If we are running GLI in debug mode:
                // In case parsing exceptions are thrown (SAX Exceptions), we want to
                // get some idea of where things went wrong. We need to read the XML
                // twice to know the line where things went wrong, so do the
                // additional reading only if we're debugging.
                xmlContents = readFully(xml_reader);
                xml_reader.close(); // closing the old Reader

                // we need a Reader to parse the same contents as the Reader
                // that was just closed
                reader = new BufferedReader(new StringReader(xmlContents));
            }

            // (3) The actual XML parsing
            InputSource isc = new InputSource(reader);
            DOMParser parser = new DOMParser();
            parser.setFeature("http://xml.org/sax/features/validation", false);
            parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            // May or may not be ignored, the documentation for Xerces is contradictory.
            // If it works then parsing -should- be faster.
            parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
            parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
            parser.parse(isc);
            document = parser.getDocument();
        } catch (SAXParseException e) {
            showXMLParseFailureLine(e, xmlContents);
        } catch (SAXException exception) {
            System.err.println("SAX exception: " + exception.getMessage());
            if (DebugStream.isDebuggingEnabled()) {
                DebugStream.println("Encountered a SAX exception when parsing the following:\n*********START\n"
                                    + xmlContents + "\n************END\n");
                // Exit to let the user view the erroneous line/xml before it goes
                // past the screen buffer?
                DebugStream.println("Debug mode: Exiting the program as there was trouble parsing the XML...");
                System.exit(-1);
            }
            // else, not running in debug mode, so don't exit after exception
            System.out.println("***Turn debugging on (run GLI with -debug) to view the XML contents that could not be parsed.");
            DebugStream.printStackTrace(exception);
        } catch (Exception exception) {
            DebugStream.printStackTrace(exception);
        }

        return document;
    }

    /** Reads everything the reader has to offer into a single string. */
    private static String readFully(Reader source) throws IOException {
        StringBuffer contents = new StringBuffer();
        char[] chunk = new char[500];
        int count;
        while ((count = source.read(chunk, 0, chunk.length)) != -1) {
            contents.append(chunk, 0, count);
        }
        return contents.toString();
    }

    /** Closes a resource if it is non-null, logging (not throwing) any failure. */
    private static void closeAndLog(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
    }

    /**
     * Displays the line (string) where the SAXParseException occurred, given a
     * String of the entire xml that was being parsed and the SAXParseException
     * object that was caught. The messages are printed to DebugStream, so run
     * GLI/FLI with -debug to view this output. In debug mode this method ends
     * by calling System.exit(-1).
     *
     * @param xmlContents is the entire xml that was being parsed when the
     *                    exception occurred
     * @param e           is the SAXParseException object that was thrown upon
     *                    parsing the xmlContents.
     */
    public static void showXMLParseFailureLine(SAXParseException e, String xmlContents) {
        if (xmlContents == null) {
            xmlContents = "";
        }

        // There should be no characters at all that preceed the XML declaration.
        // The first check is for starting spaces:
        if (xmlContents.startsWith("\n") || xmlContents.startsWith(" ") || xmlContents.startsWith("\t")) {
            DebugStream.println("ERROR: illegal start of XML. Space/tab/newline should not preceed xml declaration.\n");
            DebugStream.println("xmlContents (length is " + xmlContents.length() + "):\n" + xmlContents);
            return; // nothing more to do, first error identified
        }

        int linenumber = e.getLineNumber();
        DebugStream.println("\n****SAXParseException on LINE NUMBER: " + linenumber);

        if (!DebugStream.isDebuggingEnabled()) {
            // not running in debug mode
            System.out.println("***Turn debugging on (run GLI with -debug) to view the XML contents/line that could not be parsed.");
            return;
        }

        if (linenumber != -1) {
            // the actual line (String literal) where parsing failed and the
            // SAXParseException occurred (xmlContents is only set if GLI is run
            // with debugging turned on)
            String line = extractLine(xmlContents, linenumber);
            DebugStream.println("The parsing error occurred on this line:\n***********START\n" + line + "\n***********END");
            DebugStream.println("SAXParseException message: " + e.getMessage() + "\n");
        } else {
            // no particular line number, print out all the xml so debugger can inspect it
            DebugStream.println("Encountered a SAX exception when parsing the following:\n*********START\n"
                                + xmlContents + "\n************END\n");
        }

        // Exit to let the user view the erroneous line/xml before it goes past
        // the screen buffer?
        DebugStream.println("\nDebug mode: Exiting the program as there was trouble parsing the XML...");
        System.exit(-1);
    }

    /**
     * Returns the text of a 1-based line of a string, without its newline.
     * Each search continues from the end of the previous line; searching from
     * offset 0 every time would always yield the first line.
     *
     * @return the requested line, or "" if the text has fewer lines
     */
    private static String extractLine(String text, int line_number) {
        int start = 0;
        for (int current = 1; current < line_number; current++) {
            int newline = text.indexOf('\n', start);
            if (newline == -1) {
                return ""; // fewer lines than requested
            }
            start = newline + 1;
        }
        int end = text.indexOf('\n', start);
        if (end == -1) {
            end = text.length(); // last line has no trailing newline
        }
        return text.substring(start, end);
    }

    /**
     * Reads a UTF-8 stream line by line and returns everything from the first
     * line that starts (after trimming) with the XML declaration onwards.
     *
     * NOTE(review): reconstructed. Everything after the startsWith test was
     * lost in the reviewed copy, including the searched-for literal (assumed
     * to be the start of an XML declaration) and the error handling.
     *
     * @param input_stream the stream to read
     * @return the XML text found; empty if none was found or reading failed
     */
    static public StringBuffer readXMLStream(InputStream input_stream) {
        StringBuffer xml = new StringBuffer("");

        try {
            InputStreamReader isr = new InputStreamReader(input_stream, "UTF-8");
            BufferedReader buffered_in = new BufferedReader(isr);

            String line = "";
            boolean xml_content = false;
            while ((line = buffered_in.readLine()) != null) {
                if (!xml_content && line.trim().startsWith("<?xml")) {
                    xml_content = true;
                }
                if (xml_content) {
                    xml.append(line);
                    xml.append("\n");
                }
            }
        } catch (Exception error) {
            DebugStream.printStackTrace(error);
        }

        return xml;
    }

    /**
     * Removes characters that are not allowed in XML 1.0 documents.
     *
     * NOTE(review): reconstructed. Only the range test and the final
     * statements survived in the reviewed copy; the method name, parameter and
     * loop header are assumptions to be confirmed against upstream. The text
     * is walked by code point so that the supplementary range in the test
     * (0x10000-0x10FFFF) can actually match.
     *
     * @param text the text to clean
     * @return the text without the disallowed characters
     */
    static public String removeInvalidCharacters(String text) {
        int[] safe_characters = new int[text.length()];
        int j = 0;

        int i = 0;
        while (i < text.length()) {
            int character = text.codePointAt(i);
            i += Character.charCount(character);
            if ((character >= 0x20 && character <= 0xD7FF)
                || character == 0x09
                || character == 0x0A
                || character == 0x0D
                || (character >= 0xE000 && character <= 0xFFFD)
                || (character >= 0x10000 && character <= 0x10FFFF)) {
                safe_characters[j] = character;
                j++;
            }
        }

        return new String(safe_characters, 0, j);
    }

    // ------------------------------------------------------------------
    // DOM modification
    // ------------------------------------------------------------------

    /**
     * Replaces all direct text-node children of an element with a single new
     * text node. Non-text children are left in place.
     *
     * @param element the element to modify
     * @param text    the new text; if null the text nodes are only removed
     */
    static public void setElementTextValue(Element element, String text) {
        // Remove all text node children (backwards, as the list is live)
        NodeList children_nodelist = element.getChildNodes();
        for (int i = children_nodelist.getLength() - 1; i >= 0; i--) {
            Node child_node = children_nodelist.item(i);
            if (child_node.getNodeType() == Node.TEXT_NODE) {
                element.removeChild(child_node);
            }
        }

        // Add a new text node
        if (text != null) {
            element.appendChild(element.getOwnerDocument().createTextNode(text));
        }
    }

    /**
     * Set the #text node value of some element. All existing children of the
     * element (not only text nodes) are removed first.
     *
     * @param element the Element whose value we wish to set
     * @param value   the new value for the element as a String
     * Soon to be deprecated!
     */
    static final public void setValue(Element element, String value) {
        // Remove any existing child node(s)
        clear(element);
        // Add new text node.
        if (value != null) {
            element.appendChild(element.getOwnerDocument().createTextNode(value));
        }
    }

    // ------------------------------------------------------------------
    // Serialisation
    // ------------------------------------------------------------------

    /**
     * Write an XML document to a given file with the text node of the
     * specified elements unescaped. Failures are logged to DebugStream rather
     * than thrown; the output is closed in every case.
     *
     * @param xml_file            the file to write
     * @param document            the document to serialise
     * @param nonEscapingTagNames names of elements whose text must not be
     *                            escaped, or null for none
     */
    static public void writeXMLFile(File xml_file, Document document, String[] nonEscapingTagNames) {
        OutputStream os = null;
        Writer w = null;
        try {
            os = new FileOutputStream(xml_file);

            // Create an output format for our document.
            OutputFormat f = new OutputFormat(document);
            f.setEncoding("UTF-8");
            f.setIndenting(true);
            f.setLineWidth(0); // Why isn't this working!
            f.setPreserveSpace(false);
            if (nonEscapingTagNames != null) {
                f.setNonEscapingElements(nonEscapingTagNames);
            }

            // Create the necessary writer stream for serialization.
            w = new BufferedWriter(new OutputStreamWriter(os, "UTF-8"));

            // Generate a new serializer from the above.
            XMLSerializer s = new XMLSerializer(w, f);
            s.asDOMSerializer();

            // Finally serialize the document to file.
            s.serialize(document);
        } catch (Exception exception) {
            DebugStream.printStackTrace(exception);
        } finally {
            // Closing the writer flushes it and closes the underlying stream.
            if (w != null) {
                closeAndLog(w);
            } else {
                closeAndLog(os);
            }
        }
    }

    /** Write an XML document to a given file */
    static public void writeXMLFile(File xml_file, Document document) {
        writeXMLFile(xml_file, document, null);
    }

    /** Prints a node and its descendants to System.out, starting at depth 0. */
    public static void printXMLNode(Node e) {
        printXMLNode(e, 0);
    }

    /**
     * Prints a node and its descendants to System.out, indented by one space
     * per level (recursive method call using DOM API).
     *
     * NOTE(review): reconstructed. Everything between the indentation loop and
     * the empty-element case was lost in the reviewed copy; the handling of
     * text nodes and attributes is an assumption -- confirm against upstream.
     *
     * @param e     the node to print
     * @param depth the nesting level of the node
     */
    public static void printXMLNode(Node e, int depth) {
        for (int i = 0; i < depth; i++) {
            System.out.print(' ');
        }

        String name = e.getNodeName();

        if (name.equals("#text")) {
            String text = e.getNodeValue();
            if (text != null && !text.trim().equals("")) {
                System.out.println(text);
            }
            return;
        }

        System.out.print('<');
        System.out.print(name);
        NamedNodeMap attrs = e.getAttributes();
        if (attrs != null) {
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                System.out.print(' ');
                System.out.print(attr.getNodeName());
                System.out.print("=\"");
                System.out.print(attr.getNodeValue());
                System.out.print('"');
            }
        }

        NodeList children = e.getChildNodes();
        if (children == null || children.getLength() == 0) {
            System.out.println("/>");
        } else {
            System.out.println('>');

            int len = children.getLength();
            for (int i = 0; i < len; i++) {
                printXMLNode(children.item(i), depth + 1);
            }

            for (int i = 0; i < depth; i++) {
                System.out.print(' ');
            }
            System.out.println("</" + name + ">");
        }
    }

    /**
     * Renders a node and its descendants as text, one tag per line, indented
     * by one space per level.
     */
    public static String xmlNodeToString(Node e) {
        StringBuffer sb = new StringBuffer("");
        appendNode(sb, e, 0, "\n");
        return sb.toString();
    }

    /** Renders a node and its descendants as text on one line, without indentation. */
    public static String xmlNodeToStringWithoutIndenting(Node e) {
        StringBuffer sb = new StringBuffer("");
        // A negative depth switches indentation off for the whole subtree.
        appendNode(sb, e, -1, "");
        return sb.toString();
    }

    /**
     * Renders a node and its descendants as text on one line, keeping the
     * one-space-per-level indentation in front of each node.
     */
    public static String xmlNodeToStringWithoutNewline(Node e) {
        StringBuffer sb = new StringBuffer("");
        appendNode(sb, e, 0, "");
        return sb.toString();
    }

    /**
     * Shared serialiser behind the xmlNodeToString variants.
     *
     * NOTE(review): reconstructed. The reviewed copy only retained the
     * indentation loops, the text appended after tags and the child recursion
     * (including the rule that a negative depth is passed on unchanged); text
     * node and attribute handling are assumptions -- confirm against upstream.
     *
     * @param sb        buffer to append to
     * @param e         node to render
     * @param depth     nesting level; negative disables indentation
     * @param tag_end   text appended after each start, empty and end tag
     */
    private static void appendNode(StringBuffer sb, Node e, int depth, String tag_end) {
        for (int i = 0; i < depth; i++) {
            sb.append(' ');
        }

        String name = e.getNodeName();

        if (name.equals("#text")) {
            String text = e.getNodeValue();
            if (text != null && !text.trim().equals("")) {
                sb.append(text);
            }
            return;
        }

        sb.append('<');
        sb.append(name);
        NamedNodeMap attrs = e.getAttributes();
        if (attrs != null) {
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                sb.append(' ');
                sb.append(attr.getNodeName());
                sb.append("=\"");
                sb.append(attr.getNodeValue());
                sb.append('"');
            }
        }

        NodeList children = e.getChildNodes();
        if (children == null || children.getLength() == 0) {
            sb.append("/>").append(tag_end);
        } else {
            sb.append(">").append(tag_end);

            int child_depth = (depth >= 0) ? depth + 1 : depth;
            int len = children.getLength();
            for (int i = 0; i < len; i++) {
                appendNode(sb, children.item(i), child_depth, tag_end);
            }

            for (int i = 0; i < depth; i++) {
                sb.append(' ');
            }
            sb.append("</" + name + ">").append(tag_end);
        }
    }
}