source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 24393

Last change on this file since 24393 was 23791, checked in by davidb, 13 years ago

Changes to take advantage of the more efficient Apache Commons string manipulation classes.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.3 KB
RevLine 
[3235]1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
[3222]19package org.greenstone.gsdl3.util;
20
21// XML classes
[18434]22import org.w3c.dom.DOMImplementation;
[3222]23import org.w3c.dom.Document;
[16374]24import org.w3c.dom.DocumentType;
[16688]25import org.w3c.dom.Element;
[3222]26import org.w3c.dom.Node;
[3768]27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
[3222]29import org.xml.sax.InputSource;
[5187]30import org.xml.sax.EntityResolver;
[18434]31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
[3222]33import org.apache.xerces.parsers.DOMParser;
[18434]34import org.apache.xerces.dom.*; // for new Documents
[3222]35
36// other java classes
37import java.io.Reader;
[4087]38import java.io.InputStreamReader;
[3222]39import java.io.StringReader;
40import java.io.File;
[4087]41import java.io.FileInputStream;
[3222]42import java.io.FileReader;
[23791]43import java.util.regex.*;
[3222]44
[13124]45import org.apache.log4j.*;
46
[23791]47// Apache Commons
48import org.apache.commons.lang3.*;
49
50import java.util.*;
51import java.lang.reflect.*;
52
[5187]53/** XMLConverter - utility class for greenstone
[3222]54 *
55 * parses XML Strings into Documents, converts Nodes to Strings
56 * different parsers have different behaviour - can experiment in here
[3941]57 * now we only use xerces
[3222]58 *
59 */
60public class XMLConverter {
61
// Logger for this class; also used by the nested ParseErrorHandler.
[13210]62 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
[13124]63
[3222]64 /** xerces parser; created and configured in the constructor, and left null if creating it fails there */
[5187]65 protected DOMParser parser = null;
[3222]66
// Whether text nodes are entity-escaped while serialising. getString(Node) and
// getPrettyString(Node) reset it to true before they start, and the
// javax.xml.transform.disable-output-escaping / enable-output-escaping processing
// instructions flip it during serialisation. It is static, so the one flag is
// shared by every serialisation call made through this class.
[18307]67 private static boolean outputEscaping = true;
[3768]68
/**
 * The no-args constructor. Creates the Xerces DOM parser and configures it as a
 * non-validating parser that reports problems through a {@link ParseErrorHandler}.
 *
 * Any exception raised while creating or configuring the parser is logged (with
 * its stack trace) and not rethrown; in that case the parser field may be left
 * null or only partially configured.
 */
public XMLConverter() {
    try {
        this.parser = new DOMParser();
        this.parser.setFeature("http://xml.org/sax/features/validation", false);
        // don't try and load external DTD - no need if we are not validating,
        // and may cause connection errors if a proxy is not set up.
        this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // a performance test showed that having this on led to increased
        // memory use for small-medium docs, and not much gain for large docs.
        // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
        this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
        // add an error handler to the parser which will store a useful error
        // message on encountering fatal errors, errors and warnings when parsing.
        // This error message can then be converted to xhtml and displayed in a browser.
        this.parser.setErrorHandler(new ParseErrorHandler());
    } catch (Exception e) {
        // pass the exception itself so the stack trace reaches the log;
        // e.getMessage() alone may be null and hides where the failure occurred
        logger.error(e.getMessage(), e);
    }
}
88
[5187]89 /** sets the entity resolver. pass in null to unset it */
90 public void setEntityResolver(EntityResolver er) {
91 this.parser.setEntityResolver(er);
92 }
[16688]93
[16780]94 /** Given a Node representing an Element or Document, will return the
95 * Element/docroot Element. Returns null if the Node was not an element. */
96 public static Element nodeToElement(Node node)
[16688]97 {
[16999]98 if(node == null) {
99 return null;
100 }
[16688]101 short nodeType = node.getNodeType();
102
103 if (nodeType == Node.DOCUMENT_NODE) {
104 Document docNode = (Document)node;
105 return docNode.getDocumentElement() ;
106 }
107 else if (nodeType == Node.ELEMENT_NODE) {
108 return (Element)node;
109 }
110 else {
[16780]111 String message = "Expecting Document or Element node type but got "
112 + node.getNodeName() + "\nReturning null";
113 System.err.println(message);
114 logger.warn(message);
[16688]115 return null;
116 }
117 }
118
[3222]119 /** returns a DOM Document */
120 public Document getDOM(String in) {
121
122 try {
123 Reader reader = new StringReader(in);
124 InputSource xml_source = new InputSource(reader);
125
[5187]126 this.parser.parse(xml_source);
127 Document doc = this.parser.getDocument();
[10202]128
[3222]129 return doc;
[4087]130
[3222]131 } catch (Exception e) {
[13210]132 logger.error(e.getMessage());
[3222]133 }
134 return null;
135 }
[3235]136
137 /** returns a DOM Document */
[3222]138 public Document getDOM(File in) {
139 try {
[4087]140 FileReader reader = new FileReader(in);
[3222]141 InputSource xml_source = new InputSource(reader);
[5187]142 this.parser.parse(xml_source);
143 Document doc = this.parser.getDocument();
[4087]144 return doc;
145
146 } catch (Exception e) {
[13210]147 logger.error(e.getMessage(), e);
148
[4087]149 }
150 return null;
151 }
152
153 /** returns a DOM document */
154 public Document getDOM(File in, String encoding) {
155 try {
156
157 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
158 InputSource xml_source = new InputSource(isr);
159
[5187]160 this.parser.parse(xml_source);
161 Document doc = this.parser.getDocument();
[3222]162
163 return doc;
164
165 } catch (Exception e) {
[13210]166 logger.error(e.getMessage());
[3222]167 }
168 return null;
169 }
[4087]170
[3222]171
172 /** creates a new empty DOM Document */
[18307]173 public static Document newDOM() {
[3941]174 Document doc = new DocumentImpl();
[3222]175 return doc;
176 }
[18434]177
178 /**
179 * This method's parameters represent the parts of the Doctype of this
180 * Document that is to be created.
181 * For more info see
182 * http://xerces.apache.org/xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl.html#DocumentTypeImpl(org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
183 *
184 * */
185 public static Document newDOM(String qualifiedName, String publicID, String systemID) {
186 // create empty DOM document
187 DocumentImpl docImpl = new DocumentImpl();
188
189 // Need to use the document to create the docType for it
190 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
191
192 // Although we have created the docType using the document, we need to still
193 // put it into the empty document we just created
194 try{
195 docImpl.appendChild(myDocType);
196 } catch(Exception e) {
197 System.out.println("Could not append docType because: " + e) ;
198 }
199
200 // return the document containing a DocType
201 return docImpl;
202 }
[3222]203
204 /** returns the Node as a String */
[18307]205 public static String getString(Node xmlNode)
[3768]206 {
207 outputEscaping = true;
[10202]208 StringBuffer xmlRepresentation = new StringBuffer();
209 getString(xmlNode, xmlRepresentation, 0, false);
210 return xmlRepresentation.toString();
[3768]211 }
212
[3970]213 /** returns the node as a nicely formatted String - this introduces extra
214 * text nodes if the String is read back in as a DOM, so should only be
215 * used for printing */
[18307]216 public static String getPrettyString(Node xmlNode) {
[3970]217
218 outputEscaping = true;
[10202]219 StringBuffer xmlRepresentation = new StringBuffer();
220 getString(xmlNode, xmlRepresentation, 0, true);
221 return xmlRepresentation.toString();
[3970]222 }
[3768]223
[23791]224 /* For the purposes of logger.debug statements, where this is called and hence outputted,
225 returns an empty string if debugging is not enabled */
226 public static String getPrettyStringLogger(Node xmlNode, Logger log) {
227
228 if(log.isDebugEnabled())
229 return getPrettyString(xmlNode);
230
231 return "";
232
233 }
234
[18307]235 private static void getString(Node xmlNode, StringBuffer xmlRepresentation,
[10202]236 int depth, boolean pretty)
[3768]237 {
[10202]238
239 if (xmlNode == null) {
240 xmlRepresentation.append("<null>");
241 return;
242 }
[3768]243
244 short nodeType = xmlNode.getNodeType();
245 String nodeName = xmlNode.getNodeName();
246
[9874]247 if (nodeType == Node.DOCUMENT_NODE) {
[16374]248 Document xmlDocNode = (Document)xmlNode;
249
[18434]250 //if (xmlDocNode.getDoctype() == null) {
251 //System.err.println("Doctype is null.");
252 //}
253 //else {
254 if (xmlDocNode.getDoctype() != null) {
[16374]255 DocumentType dt = xmlDocNode.getDoctype();
256
257 String name = dt.getName();
258 String pid = dt.getPublicId();
259 String sid = dt.getSystemId();
260
[23791]261 // Use previously assigned name, not dt.getName() again
262 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
[16374]263
264 xmlRepresentation.append(doctype_str);
265 }
266 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
[10202]267 return;
[9874]268 }
[3768]269 // Handle Element nodes
270 if (nodeType == Node.ELEMENT_NODE) {
[3970]271 if (pretty) {
[10202]272 xmlRepresentation.append("\n");
[3970]273 for (int i = 0; i < depth; i++) {
[10202]274 xmlRepresentation.append(" ");
[3970]275 }
276 }
[3768]277
278 // Write opening tag
[10202]279 xmlRepresentation.append("<");
280 xmlRepresentation.append(nodeName);
[3970]281
[3768]282 // Write the node attributes
283 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
284 for (int i = 0; i < nodeAttributes.getLength(); i++) {
285 Node attribute = nodeAttributes.item(i);
[10202]286 xmlRepresentation.append(" ");
287 xmlRepresentation.append(attribute.getNodeName());
288 xmlRepresentation.append("=\"");
289 xmlRepresentation.append(attribute.getNodeValue());
290 xmlRepresentation.append("\"");
[3768]291 }
292
293 // If the node has no children, close the opening tag and return
294 if (xmlNode.hasChildNodes() == false) {
295 // This produces somewhat ugly output, but it is necessary to compensate
296 // for display bugs in Netscape. Firstly, the space is needed before the
297 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
298 // example). Also, a newline character would be expected after the tag,
299 // but this causes problems with the display of links (the link text
300 // will contain a newline character, which is displayed badly).
[10202]301 xmlRepresentation.append(" />");
302 return;
[3768]303 }
[3970]304
[3768]305 // Close the opening tag
[10202]306 xmlRepresentation.append(">");
[3970]307
[3768]308 // Apply recursively to the children of this node
[4285]309 // hack for nodes next to text nodes - dont make them pretty
310 // this is needed for text inside a <pre> element - any new lines
311 // or spaces around the span elements show up in the text
[3768]312 NodeList children = xmlNode.getChildNodes();
[4285]313 boolean do_pretty = pretty;
[3768]314 for (int i = 0; i < children.getLength(); i++) {
[4285]315 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
316 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
317 }
[10202]318 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
[3768]319 }
[3970]320
[3768]321 // Write closing tag
[3970]322 if (pretty) {
[10202]323 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
[3970]324 for (int i = 0; i < depth; i++)
[10202]325 xmlRepresentation.append(" ");
[3970]326 }
[3768]327 }
[10202]328 xmlRepresentation.append("</");
329 xmlRepresentation.append(nodeName);
330 xmlRepresentation.append(">");
[3970]331 if (pretty) {
[10202]332 xmlRepresentation.append("\n");
[3970]333 }
[3501]334 }
[3970]335
[3768]336 // Handle Text nodes
337 else if (nodeType == Node.TEXT_NODE) {
338 String text = xmlNode.getNodeValue();
339
340 // Perform output escaping, if required
[23791]341 // Apache Commons replace method is far superior to String.replaceAll - very fast!
[3768]342 if (outputEscaping) {
[23791]343
344 text = StringUtils.replace(text, "&", "&amp;");
345 text = StringUtils.replace(text, "<", "&lt;");
346 text = StringUtils.replace(text, ">", "&gt;");
347 text = StringUtils.replace(text, "'", "&apos;");
348 text = StringUtils.replace(text, "\"", "&quot;");
[3768]349 }
350
351 // Remove any control-C characters
[23791]352 text = StringUtils.replace(text, "" + (char)3, "");
353
[10202]354 xmlRepresentation.append(text);
[3768]355 }
356
357 // Handle Processing Instruction nodes
358 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
[4013]359 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
[3768]360 outputEscaping = false;
361 }
[4013]362 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
[3768]363 outputEscaping = true;
364 }
365 else {
[13210]366 logger.warn("Unhandled processing instruction " + nodeName);
[3768]367 }
368 }
[8923]369
370 else if (nodeType == Node.COMMENT_NODE) {
371 String text = xmlNode.getNodeValue();
[10202]372 xmlRepresentation.append("<!-- ");
373 xmlRepresentation.append(text);
374 xmlRepresentation.append(" -->");
[8923]375 }
376
377
378
[3768]379 // A type of node that is not handled yet
380 else {
[13210]381 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
[3768]382 }
383
[10202]384 return;
[3222]385 }
[3908]386
387 protected static String getNodeTypeString(short node_type) {
388
389 String type = "";
390 switch(node_type) {
391 case Node.ATTRIBUTE_NODE:
392 type="ATTRIBUTE_NODE";
393 break;
394 case Node.CDATA_SECTION_NODE:
395 type="CDATA_SECTION_NODE";
396 break;
397 case Node.COMMENT_NODE:
398 type="COMMENT_NODE";
399 break;
400 case Node.DOCUMENT_FRAGMENT_NODE:
401 type="DOCUMENT_FRAGMENT_NODE";
402 break;
403 case Node.DOCUMENT_NODE:
404 type="DOCUMENT_NODE";
405 break;
406 case Node.DOCUMENT_TYPE_NODE:
407 type="DOCUMENT_TYPE_NODE";
408 break;
409 case Node.ELEMENT_NODE:
410 type="ELEMENT_NODE";
411 break;
412 case Node.ENTITY_NODE:
413 type="ENTITY_NODE";
414 break;
415 case Node.ENTITY_REFERENCE_NODE:
416 type="ENTITY_REFERENCE_NODE";
417 break;
418 case Node.NOTATION_NODE:
419 type="NOTATION_NODE";
420 break;
421 case Node.PROCESSING_INSTRUCTION_NODE:
422 type="PROCESSING_INSTRUCTION_NODE";
423 break;
424 case Node.TEXT_NODE:
425 type="TEXT_NODE";
426 break;
427 default:
428 type="UNKNOWN";
429 }
430
431 return type;
432 }
[18434]433
434 // returns null if there no error occurred during parsing, or else returns the error message
435 public String getParseErrorMessage() {
436 ParseErrorHandler errorHandler = (ParseErrorHandler)this.parser.getErrorHandler();
437 return errorHandler.getErrorMessage();
438 }
439
440 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
441 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
442 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
443 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
444 static public class ParseErrorHandler implements ErrorHandler {
445 protected String errorMessage = null;
446
447 // Receive notification of a recoverable error.
448 public void error(SAXParseException exception) {
449 handleError("Error:\n", exception);
450 }
451 // Receive notification of a non-recoverable error.
452 public void fatalError(SAXParseException exception) {
453 handleError("Fatal Error:\n", exception);
454 }
455 // Receive notification of a warning.
456 public void warning(SAXParseException exception) {
457 handleError("Warning:\n", exception);
458 }
459
460 public String toString(SAXParseException e) {
461 String msg = e.getMessage();
462 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
463 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
464 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
465
466 return msg;
467 }
468
469 // clears the errorPage variable after first call to this method
470 public String getErrorMessage() {
471 String errMsg = this.errorMessage;
472 if(this.errorMessage != null) {
473 this.errorMessage = null;
474 }
475 return errMsg;
476 }
477
478 // sets the errorMessage member variable to the data stored in the exception
479 // and writes the errorMessage to the logger and tomcat's System.err
480 protected void handleError(String errorType, SAXParseException exception) {
481 this.errorMessage = errorType + toString(exception);
482 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
483 logger.error(this.errorMessage);
484 }
485 }
[3222]486}
Note: See TracBrowser for help on using the repository browser.