source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 26026

Last change on this file since 26026 was 25655, checked in by sjm84, 12 years ago

Added a version of getDOM that takes an encoding (not sure if this is even useful though)

  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
RevLine 
[3235]1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
[3222]19package org.greenstone.gsdl3.util;
20
21// XML classes
[18434]22import org.w3c.dom.DOMImplementation;
[24862]23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
[3768]27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
[3222]29import org.xml.sax.InputSource;
[5187]30import org.xml.sax.EntityResolver;
[18434]31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
[3222]33import org.apache.xerces.parsers.DOMParser;
[18434]34import org.apache.xerces.dom.*; // for new Documents
[3222]35
36// other java classes
[25655]37import java.io.ByteArrayInputStream;
38import java.io.InputStream;
[3222]39import java.io.Reader;
[4087]40import java.io.InputStreamReader;
[3222]41import java.io.StringReader;
42import java.io.File;
[4087]43import java.io.FileInputStream;
[3222]44import java.io.FileReader;
[23791]45import java.util.regex.*;
[3222]46
[13124]47import org.apache.log4j.*;
48
[23791]49// Apache Commons
50import org.apache.commons.lang3.*;
51
52import java.util.*;
53import java.lang.reflect.*;
54
[24862]55/**
 * XMLConverter - utility class for greenstone
 *
 * Parses XML Strings and Files into DOM Documents, and converts Nodes to
 * Strings. Different parsers have different behaviour, which can be
 * experimented with in here; at present only xerces is used.
61 *
[3222]62 */
[24862]63public class XMLConverter
64{
[3222]65
/** log4j logger used by the converter and by the nested ParseErrorHandler */
[24862]66 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
[13124]67
/** entity resolver recorded by setEntityResolver(); null when none has been set */
[24863]68 protected EntityResolver resolver = null;
69
[24862]70 /** xerces parser created and configured by the constructor; getParseErrorMessage() reads its error handler */
71 protected DOMParser parser = null;
[3222]72
[24862]73 private static boolean outputEscaping = true;
[3768]74
/**
 * The no-args constructor. Creates the instance's xerces parser, configures
 * it as non-validating and registers a ParseErrorHandler on it.
 *
 * Any failure during set-up is logged (with its stack trace) rather than
 * thrown, so the parser field may be left null or only partly configured.
 */
public XMLConverter()
{
	try
	{
		this.parser = new DOMParser();
		this.parser.setFeature("http://xml.org/sax/features/validation", false);
		// don't try and load external DTD - no need if we are not validating,
		// and may cause connection errors if a proxy is not set up.
		this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
		// a performance test showed that having this on led to increased
		// memory use for small-medium docs, and not much gain for large docs.
		// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
		this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
		// add an error handler to the parser which will store a useful error
		// message on encountering fatal errors, errors and warnings when parsing.
		// This message can then be converted to xhtml and displayed in a browser.
		this.parser.setErrorHandler(new ParseErrorHandler());
	}
	catch (Exception e)
	{
		// hand the exception itself to the logger so the stack trace is kept
		logger.error(e.getMessage(), e);
	}
}
98
[24862]99 /** sets the entity resolver. pass in null to unset it */
100 public void setEntityResolver(EntityResolver er)
101 {
[24863]102 this.resolver = er;
[24862]103 this.parser.setEntityResolver(er);
104 }
[16688]105
[24862]106 /**
107 * Given a Node representing an Element or Document, will return the
108 * Element/docroot Element. Returns null if the Node was not an element.
109 */
110 public static Element nodeToElement(Node node)
111 {
112 if (node == null)
113 {
114 return null;
115 }
116 short nodeType = node.getNodeType();
117
118 if (nodeType == Node.DOCUMENT_NODE)
119 {
120 Document docNode = (Document) node;
121 return docNode.getDocumentElement();
122 }
123 else if (nodeType == Node.ELEMENT_NODE)
124 {
125 return (Element) node;
126 }
127 else
128 {
129 String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
130 System.err.println(message);
131 logger.warn(message);
132 return null;
133 }
[16999]134 }
[16688]135
[24862]136 /** returns a DOM Document */
137 public Document getDOM(String in)
138 {
139
140 try
141 {
142 Reader reader = new StringReader(in);
143 InputSource xml_source = new InputSource(reader);
144
[24863]145 DOMParser parser = new DOMParser();
146 parser.setFeature("http://xml.org/sax/features/validation", false);
147 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
148 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
149 if(resolver != null)
150 {
151 parser.setEntityResolver(this.resolver);
152 }
153 parser.setErrorHandler(new ParseErrorHandler());
154 parser.parse(xml_source);
155
156 Document doc = parser.getDocument();
[24862]157
158 return doc;
159
160 }
161 catch (Exception e)
162 {
163 logger.error(e.getMessage());
164 }
165 return null;
[16688]166 }
[24862]167
168 /** returns a DOM Document */
[25655]169 public Document getDOM(String in, String encoding)
170 {
171 try
172 {
173 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
174 InputSource xml_source = new InputSource(reader);
175
176 DOMParser parser = new DOMParser();
177 parser.setFeature("http://xml.org/sax/features/validation", false);
178 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
179 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
180 if(resolver != null)
181 {
182 parser.setEntityResolver(this.resolver);
183 }
184 parser.setErrorHandler(new ParseErrorHandler());
185 parser.parse(xml_source);
186
187 Document doc = parser.getDocument();
188
189 return doc;
190
191 }
192 catch (Exception e)
193 {
194 logger.error(e.getMessage());
195 }
196 return null;
197 }
198
199 /** returns a DOM Document */
[24862]200 public Document getDOM(File in)
201 {
202 try
203 {
204 FileReader reader = new FileReader(in);
205 InputSource xml_source = new InputSource(reader);
[24863]206
207 DOMParser parser = new DOMParser();
208 parser.setFeature("http://xml.org/sax/features/validation", false);
209 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
210 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
211 if(resolver != null)
212 {
213 parser.setEntityResolver(this.resolver);
214 }
215 parser.setErrorHandler(new ParseErrorHandler());
216 parser.parse(xml_source);
217
218 Document doc = parser.getDocument();
[24862]219 return doc;
220
221 }
222 catch (Exception e)
223 {
224 logger.error(e.getMessage(), e);
225
226 }
227 return null;
[16688]228 }
229
[24862]230 /** returns a DOM document */
231 public Document getDOM(File in, String encoding)
232 {
233 try
234 {
235
236 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
237 InputSource xml_source = new InputSource(isr);
238
[24863]239 DOMParser parser = new DOMParser();
240 parser.setFeature("http://xml.org/sax/features/validation", false);
241 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
242 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
243 if(resolver != null)
244 {
245 parser.setEntityResolver(this.resolver);
246 }
247 parser.setErrorHandler(new ParseErrorHandler());
248 parser.parse(xml_source);
249
250 Document doc = parser.getDocument();
[24862]251
252 return doc;
253
254 }
255 catch (Exception e)
256 {
257 logger.error(e.getMessage());
258 }
259 return null;
[3222]260 }
[4087]261
[24862]262 /** creates a new empty DOM Document */
263 public static Document newDOM()
264 {
265 Document doc = new DocumentImpl();
266 return doc;
[4087]267 }
268
[24862]269 /**
270 * This method's parameters represent the parts of the Doctype of this
271 * Document that is to be created. For more info see
272 * http://xerces.apache.org
273 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
274 * .html#DocumentTypeImpl
275 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
276 *
277 * */
278 public static Document newDOM(String qualifiedName, String publicID, String systemID)
279 {
280 // create empty DOM document
281 DocumentImpl docImpl = new DocumentImpl();
[3222]282
[24862]283 // Need to use the document to create the docType for it
284 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
[3222]285
[18434]286 // Although we have created the docType using the document, we need to still
287 // put it into the empty document we just created
[24862]288 try
289 {
[18434]290 docImpl.appendChild(myDocType);
291 }
[24862]292 catch (Exception e)
293 {
294 System.out.println("Could not append docType because: " + e);
295 }
296
[18434]297 // return the document containing a DocType
[24862]298 return docImpl;
299 }
[3222]300
[24862]301 /** returns the Node as a String */
302 public static String getString(Node xmlNode)
303 {
304 outputEscaping = true;
305 StringBuffer xmlRepresentation = new StringBuffer();
306 getString(xmlNode, xmlRepresentation, 0, false);
307 return xmlRepresentation.toString();
308 }
[3768]309
[24862]310 /**
311 * returns the node as a nicely formatted String - this introduces extra
312 * text nodes if the String is read back in as a DOM, so should only be used
313 * for printing
314 */
315 public static String getPrettyString(Node xmlNode)
316 {
[3768]317
[24862]318 outputEscaping = true;
319 StringBuffer xmlRepresentation = new StringBuffer();
320 getString(xmlNode, xmlRepresentation, 0, true);
321 return xmlRepresentation.toString();
322 }
[23791]323
[24862]324 /*
325 * For the purposes of logger.debug statements, where this is called and
326 * hence outputted, returns an empty string if debugging is not enabled
327 */
328 public static String getPrettyStringLogger(Node xmlNode, Logger log)
329 {
[23791]330
[24862]331 if (log.isDebugEnabled())
332 return getPrettyString(xmlNode);
[23791]333
[24862]334 return "";
335
[10202]336 }
[3768]337
[24862]338 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
339 {
[3768]340
[24862]341 if (xmlNode == null)
342 {
343 xmlRepresentation.append("<null>");
344 return;
[3970]345 }
[3768]346
[24862]347 short nodeType = xmlNode.getNodeType();
348 String nodeName = xmlNode.getNodeName();
[3768]349
[24862]350 if (nodeType == Node.DOCUMENT_NODE)
351 {
352 Document xmlDocNode = (Document) xmlNode;
353
354 //if (xmlDocNode.getDoctype() == null) {
355 //System.err.println("Doctype is null.");
356 //}
357 //else {
358 if (xmlDocNode.getDoctype() != null)
359 {
360 DocumentType dt = xmlDocNode.getDoctype();
361
362 String name = dt.getName();
363 String pid = dt.getPublicId();
364 String sid = dt.getSystemId();
365
366 // Use previously assigned name, not dt.getName() again
367 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
368
369 xmlRepresentation.append(doctype_str);
370 }
371 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
372 return;
[4285]373 }
[24862]374 // Handle Element nodes
375 if (nodeType == Node.ELEMENT_NODE)
376 {
377 if (pretty)
378 {
379 xmlRepresentation.append("\n");
380 for (int i = 0; i < depth; i++)
381 {
382 xmlRepresentation.append(" ");
383 }
384 }
385
386 // Write opening tag
387 xmlRepresentation.append("<");
388 xmlRepresentation.append(nodeName);
389
390 // Write the node attributes
391 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
392 for (int i = 0; i < nodeAttributes.getLength(); i++)
393 {
394 Node attribute = nodeAttributes.item(i);
395 xmlRepresentation.append(" ");
396 xmlRepresentation.append(attribute.getNodeName());
397 xmlRepresentation.append("=\"");
398 xmlRepresentation.append(attribute.getNodeValue());
399 xmlRepresentation.append("\"");
400 }
401
402 // If the node has no children, close the opening tag and return
403 if (xmlNode.hasChildNodes() == false)
404 {
405 // This produces somewhat ugly output, but it is necessary to compensate
406 // for display bugs in Netscape. Firstly, the space is needed before the
407 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
408 // example). Also, a newline character would be expected after the tag,
409 // but this causes problems with the display of links (the link text
410 // will contain a newline character, which is displayed badly).
411 xmlRepresentation.append(" />");
412 return;
413 }
414
415 // Close the opening tag
416 xmlRepresentation.append(">");
417
418 // Apply recursively to the children of this node
419 // hack for nodes next to text nodes - dont make them pretty
420 // this is needed for text inside a <pre> element - any new lines
421 // or spaces around the span elements show up in the text
422 NodeList children = xmlNode.getChildNodes();
423 boolean do_pretty = pretty;
424 for (int i = 0; i < children.getLength(); i++)
425 {
426 if (children.item(i).getNodeType() == Node.TEXT_NODE)
427 {
428 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
429 }
430 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
431 }
432
433 // Write closing tag
434 if (pretty)
435 {
436 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
437 {
438 for (int i = 0; i < depth; i++)
439 xmlRepresentation.append(" ");
440 }
441 }
442 xmlRepresentation.append("</");
443 xmlRepresentation.append(nodeName);
444 xmlRepresentation.append(">");
445 if (pretty)
446 {
447 xmlRepresentation.append("\n");
448 }
[3970]449 }
[3768]450
[24862]451 // Handle Text nodes
452 else if (nodeType == Node.TEXT_NODE)
453 {
454 String text = xmlNode.getNodeValue();
[23791]455
[24862]456 // Perform output escaping, if required
457 // Apache Commons replace method is far superior to String.replaceAll - very fast!
458 if (outputEscaping)
459 {
[3768]460
[24862]461 text = StringUtils.replace(text, "&", "&amp;");
462 text = StringUtils.replace(text, "<", "&lt;");
463 text = StringUtils.replace(text, ">", "&gt;");
464 text = StringUtils.replace(text, "'", "&apos;");
465 text = StringUtils.replace(text, "\"", "&quot;");
466 }
[23791]467
[24862]468 // Remove any control-C characters
469 text = StringUtils.replace(text, "" + (char) 3, "");
[3768]470
[24862]471 xmlRepresentation.append(text);
472 }
473
474 // Handle Processing Instruction nodes
475 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
476 {
477 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
478 {
479 outputEscaping = false;
480 }
481 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
482 {
483 outputEscaping = true;
484 }
485 else
486 {
487 logger.warn("Unhandled processing instruction " + nodeName);
488 }
489 }
490
491 else if (nodeType == Node.COMMENT_NODE)
492 {
493 String text = xmlNode.getNodeValue();
494 xmlRepresentation.append("<!-- ");
495 xmlRepresentation.append(text);
496 xmlRepresentation.append(" -->");
497 }
498
499 // A type of node that is not handled yet
500 else
501 {
502 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
503 }
504
505 return;
[3768]506 }
507
[24862]508 protected static String getNodeTypeString(short node_type)
509 {
[3908]510
[24862]511 String type = "";
512 switch (node_type)
513 {
514 case Node.ATTRIBUTE_NODE:
515 type = "ATTRIBUTE_NODE";
516 break;
517 case Node.CDATA_SECTION_NODE:
518 type = "CDATA_SECTION_NODE";
519 break;
520 case Node.COMMENT_NODE:
521 type = "COMMENT_NODE";
522 break;
523 case Node.DOCUMENT_FRAGMENT_NODE:
524 type = "DOCUMENT_FRAGMENT_NODE";
525 break;
526 case Node.DOCUMENT_NODE:
527 type = "DOCUMENT_NODE";
528 break;
529 case Node.DOCUMENT_TYPE_NODE:
530 type = "DOCUMENT_TYPE_NODE";
531 break;
532 case Node.ELEMENT_NODE:
533 type = "ELEMENT_NODE";
534 break;
535 case Node.ENTITY_NODE:
536 type = "ENTITY_NODE";
537 break;
538 case Node.ENTITY_REFERENCE_NODE:
539 type = "ENTITY_REFERENCE_NODE";
540 break;
541 case Node.NOTATION_NODE:
542 type = "NOTATION_NODE";
543 break;
544 case Node.PROCESSING_INSTRUCTION_NODE:
545 type = "PROCESSING_INSTRUCTION_NODE";
546 break;
547 case Node.TEXT_NODE:
548 type = "TEXT_NODE";
549 break;
550 default:
551 type = "UNKNOWN";
552 }
[3908]553
[24862]554 return type;
[3908]555 }
556
[18434]557 // returns null if there no error occurred during parsing, or else returns the error message
[24862]558 public String getParseErrorMessage()
559 {
560 ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
[18434]561 return errorHandler.getErrorMessage();
562 }
[24862]563
[18434]564 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
565 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
566 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
567 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
[24862]568 static public class ParseErrorHandler implements ErrorHandler
569 {
[18434]570 protected String errorMessage = null;
[24862]571
[18434]572 // Receive notification of a recoverable error.
[24862]573 public void error(SAXParseException exception)
574 {
[18434]575 handleError("Error:\n", exception);
576 }
[24862]577
578 // Receive notification of a non-recoverable error.
579 public void fatalError(SAXParseException exception)
580 {
[18434]581 handleError("Fatal Error:\n", exception);
582 }
[24862]583
[18434]584 // Receive notification of a warning.
[24862]585 public void warning(SAXParseException exception)
586 {
[18434]587 handleError("Warning:\n", exception);
[24862]588 }
589
590 public String toString(SAXParseException e)
591 {
[18434]592 String msg = e.getMessage();
593 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
594 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
595 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
[24862]596
[18434]597 return msg;
598 }
[24862]599
[18434]600 // clears the errorPage variable after first call to this method
[24862]601 public String getErrorMessage()
602 {
[18434]603 String errMsg = this.errorMessage;
[24862]604 if (this.errorMessage != null)
605 {
[18434]606 this.errorMessage = null;
607 }
608 return errMsg;
609 }
[24862]610
[18434]611 // sets the errorMessage member variable to the data stored in the exception
612 // and writes the errorMessage to the logger and tomcat's System.err
[24862]613 protected void handleError(String errorType, SAXParseException exception)
614 {
615 this.errorMessage = errorType + toString(exception);
[18434]616 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
617 logger.error(this.errorMessage);
618 }
619 }
[3222]620}
Note: See TracBrowser for help on using the repository browser.