source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 25265

Last change on this file since 25265 was 24863, checked in by sjm84, 13 years ago

XMLConverter no longer keeps one converter around to reuse; it makes a new one every time. This avoids an error where the parser is called while it is already parsing.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
RevLine 
[3235]1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
[3222]19package org.greenstone.gsdl3.util;
20
21// XML classes
[18434]22import org.w3c.dom.DOMImplementation;
[24862]23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
[3768]27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
[3222]29import org.xml.sax.InputSource;
[5187]30import org.xml.sax.EntityResolver;
[18434]31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
[3222]33import org.apache.xerces.parsers.DOMParser;
[18434]34import org.apache.xerces.dom.*; // for new Documents
[3222]35
36// other java classes
37import java.io.Reader;
[4087]38import java.io.InputStreamReader;
[3222]39import java.io.StringReader;
40import java.io.File;
[4087]41import java.io.FileInputStream;
[3222]42import java.io.FileReader;
[23791]43import java.util.regex.*;
[3222]44
[13124]45import org.apache.log4j.*;
46
[23791]47// Apache Commons
48import org.apache.commons.lang3.*;
49
50import java.util.*;
51import java.lang.reflect.*;
52
[24862]53/**
54 * XMLConverter - utility class for greenstone
55 *
56 * parses XML Strings into Documents, converts Nodes to Strings different
57 * parsers have different behaviour - can experiment in here now we only use
58 * xerces
59 *
[3222]60 */
[24862]61public class XMLConverter
62{
[3222]63
[24862]64 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
[13124]65
[24863]66 protected EntityResolver resolver = null;
67
[24862]68 /** xerces parser */
69 protected DOMParser parser = null;
[3222]70
[24862]71 private static boolean outputEscaping = true;
[3768]72
[24862]73 /** the no-args constructor */
74 public XMLConverter()
75 {
76 try
77 {
78 this.parser = new DOMParser();
79 this.parser.setFeature("http://xml.org/sax/features/validation", false);
80 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
81 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
82 // a performance test showed that having this on lead to increased
83 // memory use for small-medium docs, and not much gain for large
84 // docs.
85 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
86 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
87 // add an errorhandler to the parser which will store useful a error message on encountering fatal errors, errors and warnings when parsing
88 // this errormessage can then be converted to xhtml and displayed in a browser.
89 this.parser.setErrorHandler(new ParseErrorHandler());
90 }
91 catch (Exception e)
92 {
93 logger.error(e.getMessage());
94 }
[3222]95 }
96
[24862]97 /** sets the entity resolver. pass in null to unset it */
98 public void setEntityResolver(EntityResolver er)
99 {
[24863]100 this.resolver = er;
[24862]101 this.parser.setEntityResolver(er);
102 }
[16688]103
[24862]104 /**
105 * Given a Node representing an Element or Document, will return the
106 * Element/docroot Element. Returns null if the Node was not an element.
107 */
108 public static Element nodeToElement(Node node)
109 {
110 if (node == null)
111 {
112 return null;
113 }
114 short nodeType = node.getNodeType();
115
116 if (nodeType == Node.DOCUMENT_NODE)
117 {
118 Document docNode = (Document) node;
119 return docNode.getDocumentElement();
120 }
121 else if (nodeType == Node.ELEMENT_NODE)
122 {
123 return (Element) node;
124 }
125 else
126 {
127 String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
128 System.err.println(message);
129 logger.warn(message);
130 return null;
131 }
[16999]132 }
[16688]133
[24862]134 /** returns a DOM Document */
135 public Document getDOM(String in)
136 {
137
138 try
139 {
140 Reader reader = new StringReader(in);
141 InputSource xml_source = new InputSource(reader);
142
[24863]143 DOMParser parser = new DOMParser();
144 parser.setFeature("http://xml.org/sax/features/validation", false);
145 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
146 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
147 if(resolver != null)
148 {
149 parser.setEntityResolver(this.resolver);
150 }
151 parser.setErrorHandler(new ParseErrorHandler());
152 parser.parse(xml_source);
153
154 Document doc = parser.getDocument();
[24862]155
156 return doc;
157
158 }
159 catch (Exception e)
160 {
161 logger.error(e.getMessage());
162 }
163 return null;
[16688]164 }
[24862]165
166 /** returns a DOM Document */
167 public Document getDOM(File in)
168 {
169 try
170 {
171 FileReader reader = new FileReader(in);
172 InputSource xml_source = new InputSource(reader);
[24863]173
174 DOMParser parser = new DOMParser();
175 parser.setFeature("http://xml.org/sax/features/validation", false);
176 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
177 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
178 if(resolver != null)
179 {
180 parser.setEntityResolver(this.resolver);
181 }
182 parser.setErrorHandler(new ParseErrorHandler());
183 parser.parse(xml_source);
184
185 Document doc = parser.getDocument();
[24862]186 return doc;
187
188 }
189 catch (Exception e)
190 {
191 logger.error(e.getMessage(), e);
192
193 }
194 return null;
[16688]195 }
196
[24862]197 /** returns a DOM document */
198 public Document getDOM(File in, String encoding)
199 {
200 try
201 {
202
203 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
204 InputSource xml_source = new InputSource(isr);
205
[24863]206 DOMParser parser = new DOMParser();
207 parser.setFeature("http://xml.org/sax/features/validation", false);
208 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
209 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
210 if(resolver != null)
211 {
212 parser.setEntityResolver(this.resolver);
213 }
214 parser.setErrorHandler(new ParseErrorHandler());
215 parser.parse(xml_source);
216
217 Document doc = parser.getDocument();
[24862]218
219 return doc;
220
221 }
222 catch (Exception e)
223 {
224 logger.error(e.getMessage());
225 }
226 return null;
[3222]227 }
[4087]228
[24862]229 /** creates a new empty DOM Document */
230 public static Document newDOM()
231 {
232 Document doc = new DocumentImpl();
233 return doc;
[4087]234 }
235
[24862]236 /**
237 * This method's parameters represent the parts of the Doctype of this
238 * Document that is to be created. For more info see
239 * http://xerces.apache.org
240 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
241 * .html#DocumentTypeImpl
242 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
243 *
244 * */
245 public static Document newDOM(String qualifiedName, String publicID, String systemID)
246 {
247 // create empty DOM document
248 DocumentImpl docImpl = new DocumentImpl();
[3222]249
[24862]250 // Need to use the document to create the docType for it
251 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
[3222]252
[18434]253 // Although we have created the docType using the document, we need to still
254 // put it into the empty document we just created
[24862]255 try
256 {
[18434]257 docImpl.appendChild(myDocType);
258 }
[24862]259 catch (Exception e)
260 {
261 System.out.println("Could not append docType because: " + e);
262 }
263
[18434]264 // return the document containing a DocType
[24862]265 return docImpl;
266 }
[3222]267
[24862]268 /** returns the Node as a String */
269 public static String getString(Node xmlNode)
270 {
271 outputEscaping = true;
272 StringBuffer xmlRepresentation = new StringBuffer();
273 getString(xmlNode, xmlRepresentation, 0, false);
274 return xmlRepresentation.toString();
275 }
[3768]276
[24862]277 /**
278 * returns the node as a nicely formatted String - this introduces extra
279 * text nodes if the String is read back in as a DOM, so should only be used
280 * for printing
281 */
282 public static String getPrettyString(Node xmlNode)
283 {
[3768]284
[24862]285 outputEscaping = true;
286 StringBuffer xmlRepresentation = new StringBuffer();
287 getString(xmlNode, xmlRepresentation, 0, true);
288 return xmlRepresentation.toString();
289 }
[23791]290
[24862]291 /*
292 * For the purposes of logger.debug statements, where this is called and
293 * hence outputted, returns an empty string if debugging is not enabled
294 */
295 public static String getPrettyStringLogger(Node xmlNode, Logger log)
296 {
[23791]297
[24862]298 if (log.isDebugEnabled())
299 return getPrettyString(xmlNode);
[23791]300
[24862]301 return "";
302
[10202]303 }
[3768]304
[24862]305 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
306 {
[3768]307
[24862]308 if (xmlNode == null)
309 {
310 xmlRepresentation.append("<null>");
311 return;
[3970]312 }
[3768]313
[24862]314 short nodeType = xmlNode.getNodeType();
315 String nodeName = xmlNode.getNodeName();
[3768]316
[24862]317 if (nodeType == Node.DOCUMENT_NODE)
318 {
319 Document xmlDocNode = (Document) xmlNode;
320
321 //if (xmlDocNode.getDoctype() == null) {
322 //System.err.println("Doctype is null.");
323 //}
324 //else {
325 if (xmlDocNode.getDoctype() != null)
326 {
327 DocumentType dt = xmlDocNode.getDoctype();
328
329 String name = dt.getName();
330 String pid = dt.getPublicId();
331 String sid = dt.getSystemId();
332
333 // Use previously assigned name, not dt.getName() again
334 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
335
336 xmlRepresentation.append(doctype_str);
337 }
338 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
339 return;
[4285]340 }
[24862]341 // Handle Element nodes
342 if (nodeType == Node.ELEMENT_NODE)
343 {
344 if (pretty)
345 {
346 xmlRepresentation.append("\n");
347 for (int i = 0; i < depth; i++)
348 {
349 xmlRepresentation.append(" ");
350 }
351 }
352
353 // Write opening tag
354 xmlRepresentation.append("<");
355 xmlRepresentation.append(nodeName);
356
357 // Write the node attributes
358 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
359 for (int i = 0; i < nodeAttributes.getLength(); i++)
360 {
361 Node attribute = nodeAttributes.item(i);
362 xmlRepresentation.append(" ");
363 xmlRepresentation.append(attribute.getNodeName());
364 xmlRepresentation.append("=\"");
365 xmlRepresentation.append(attribute.getNodeValue());
366 xmlRepresentation.append("\"");
367 }
368
369 // If the node has no children, close the opening tag and return
370 if (xmlNode.hasChildNodes() == false)
371 {
372 // This produces somewhat ugly output, but it is necessary to compensate
373 // for display bugs in Netscape. Firstly, the space is needed before the
374 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
375 // example). Also, a newline character would be expected after the tag,
376 // but this causes problems with the display of links (the link text
377 // will contain a newline character, which is displayed badly).
378 xmlRepresentation.append(" />");
379 return;
380 }
381
382 // Close the opening tag
383 xmlRepresentation.append(">");
384
385 // Apply recursively to the children of this node
386 // hack for nodes next to text nodes - dont make them pretty
387 // this is needed for text inside a <pre> element - any new lines
388 // or spaces around the span elements show up in the text
389 NodeList children = xmlNode.getChildNodes();
390 boolean do_pretty = pretty;
391 for (int i = 0; i < children.getLength(); i++)
392 {
393 if (children.item(i).getNodeType() == Node.TEXT_NODE)
394 {
395 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
396 }
397 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
398 }
399
400 // Write closing tag
401 if (pretty)
402 {
403 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
404 {
405 for (int i = 0; i < depth; i++)
406 xmlRepresentation.append(" ");
407 }
408 }
409 xmlRepresentation.append("</");
410 xmlRepresentation.append(nodeName);
411 xmlRepresentation.append(">");
412 if (pretty)
413 {
414 xmlRepresentation.append("\n");
415 }
[3970]416 }
[3768]417
[24862]418 // Handle Text nodes
419 else if (nodeType == Node.TEXT_NODE)
420 {
421 String text = xmlNode.getNodeValue();
[23791]422
[24862]423 // Perform output escaping, if required
424 // Apache Commons replace method is far superior to String.replaceAll - very fast!
425 if (outputEscaping)
426 {
[3768]427
[24862]428 text = StringUtils.replace(text, "&", "&amp;");
429 text = StringUtils.replace(text, "<", "&lt;");
430 text = StringUtils.replace(text, ">", "&gt;");
431 text = StringUtils.replace(text, "'", "&apos;");
432 text = StringUtils.replace(text, "\"", "&quot;");
433 }
[23791]434
[24862]435 // Remove any control-C characters
436 text = StringUtils.replace(text, "" + (char) 3, "");
[3768]437
[24862]438 xmlRepresentation.append(text);
439 }
440
441 // Handle Processing Instruction nodes
442 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
443 {
444 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
445 {
446 outputEscaping = false;
447 }
448 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
449 {
450 outputEscaping = true;
451 }
452 else
453 {
454 logger.warn("Unhandled processing instruction " + nodeName);
455 }
456 }
457
458 else if (nodeType == Node.COMMENT_NODE)
459 {
460 String text = xmlNode.getNodeValue();
461 xmlRepresentation.append("<!-- ");
462 xmlRepresentation.append(text);
463 xmlRepresentation.append(" -->");
464 }
465
466 // A type of node that is not handled yet
467 else
468 {
469 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
470 }
471
472 return;
[3768]473 }
474
[24862]475 protected static String getNodeTypeString(short node_type)
476 {
[3908]477
[24862]478 String type = "";
479 switch (node_type)
480 {
481 case Node.ATTRIBUTE_NODE:
482 type = "ATTRIBUTE_NODE";
483 break;
484 case Node.CDATA_SECTION_NODE:
485 type = "CDATA_SECTION_NODE";
486 break;
487 case Node.COMMENT_NODE:
488 type = "COMMENT_NODE";
489 break;
490 case Node.DOCUMENT_FRAGMENT_NODE:
491 type = "DOCUMENT_FRAGMENT_NODE";
492 break;
493 case Node.DOCUMENT_NODE:
494 type = "DOCUMENT_NODE";
495 break;
496 case Node.DOCUMENT_TYPE_NODE:
497 type = "DOCUMENT_TYPE_NODE";
498 break;
499 case Node.ELEMENT_NODE:
500 type = "ELEMENT_NODE";
501 break;
502 case Node.ENTITY_NODE:
503 type = "ENTITY_NODE";
504 break;
505 case Node.ENTITY_REFERENCE_NODE:
506 type = "ENTITY_REFERENCE_NODE";
507 break;
508 case Node.NOTATION_NODE:
509 type = "NOTATION_NODE";
510 break;
511 case Node.PROCESSING_INSTRUCTION_NODE:
512 type = "PROCESSING_INSTRUCTION_NODE";
513 break;
514 case Node.TEXT_NODE:
515 type = "TEXT_NODE";
516 break;
517 default:
518 type = "UNKNOWN";
519 }
[3908]520
[24862]521 return type;
[3908]522 }
523
[18434]524 // returns null if there no error occurred during parsing, or else returns the error message
[24862]525 public String getParseErrorMessage()
526 {
527 ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
[18434]528 return errorHandler.getErrorMessage();
529 }
[24862]530
[18434]531 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
532 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
533 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
534 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
[24862]535 static public class ParseErrorHandler implements ErrorHandler
536 {
[18434]537 protected String errorMessage = null;
[24862]538
[18434]539 // Receive notification of a recoverable error.
[24862]540 public void error(SAXParseException exception)
541 {
[18434]542 handleError("Error:\n", exception);
543 }
[24862]544
545 // Receive notification of a non-recoverable error.
546 public void fatalError(SAXParseException exception)
547 {
[18434]548 handleError("Fatal Error:\n", exception);
549 }
[24862]550
[18434]551 // Receive notification of a warning.
[24862]552 public void warning(SAXParseException exception)
553 {
[18434]554 handleError("Warning:\n", exception);
[24862]555 }
556
557 public String toString(SAXParseException e)
558 {
[18434]559 String msg = e.getMessage();
560 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
561 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
562 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
[24862]563
[18434]564 return msg;
565 }
[24862]566
[18434]567 // clears the errorPage variable after first call to this method
[24862]568 public String getErrorMessage()
569 {
[18434]570 String errMsg = this.errorMessage;
[24862]571 if (this.errorMessage != null)
572 {
[18434]573 this.errorMessage = null;
574 }
575 return errMsg;
576 }
[24862]577
[18434]578 // sets the errorMessage member variable to the data stored in the exception
579 // and writes the errorMessage to the logger and tomcat's System.err
[24862]580 protected void handleError(String errorType, SAXParseException exception)
581 {
582 this.errorMessage = errorType + toString(exception);
[18434]583 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
584 logger.error(this.errorMessage);
585 }
586 }
[3222]587}
Note: See TracBrowser for help on using the repository browser.