source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 25128

Last change on this file since 25128 was 24863, checked in by sjm84, 12 years ago

XMLConverter no longer keeps one parser around to reuse; it creates a new one for every parse. This avoids an error where the parser is called while it is already parsing.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.apache.xerces.parsers.DOMParser;
34import org.apache.xerces.dom.*; // for new Documents
35
36// other java classes
37import java.io.Reader;
38import java.io.InputStreamReader;
39import java.io.StringReader;
40import java.io.File;
41import java.io.FileInputStream;
42import java.io.FileReader;
43import java.util.regex.*;
44
45import org.apache.log4j.*;
46
47// Apache Commons
48import org.apache.commons.lang3.*;
49
50import java.util.*;
51import java.lang.reflect.*;
52
53/**
54 * XMLConverter - utility class for greenstone
55 *
56 * parses XML Strings into Documents, converts Nodes to Strings different
57 * parsers have different behaviour - can experiment in here now we only use
58 * xerces
59 *
60 */
61public class XMLConverter
62{
63
64 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
65
66 protected EntityResolver resolver = null;
67
68 /** xerces parser */
69 protected DOMParser parser = null;
70
71 private static boolean outputEscaping = true;
72
73 /** the no-args constructor */
74 public XMLConverter()
75 {
76 try
77 {
78 this.parser = new DOMParser();
79 this.parser.setFeature("http://xml.org/sax/features/validation", false);
80 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
81 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
82 // a performance test showed that having this on lead to increased
83 // memory use for small-medium docs, and not much gain for large
84 // docs.
85 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
86 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
87 // add an errorhandler to the parser which will store useful a error message on encountering fatal errors, errors and warnings when parsing
88 // this errormessage can then be converted to xhtml and displayed in a browser.
89 this.parser.setErrorHandler(new ParseErrorHandler());
90 }
91 catch (Exception e)
92 {
93 logger.error(e.getMessage());
94 }
95 }
96
97 /** sets the entity resolver. pass in null to unset it */
98 public void setEntityResolver(EntityResolver er)
99 {
100 this.resolver = er;
101 this.parser.setEntityResolver(er);
102 }
103
104 /**
105 * Given a Node representing an Element or Document, will return the
106 * Element/docroot Element. Returns null if the Node was not an element.
107 */
108 public static Element nodeToElement(Node node)
109 {
110 if (node == null)
111 {
112 return null;
113 }
114 short nodeType = node.getNodeType();
115
116 if (nodeType == Node.DOCUMENT_NODE)
117 {
118 Document docNode = (Document) node;
119 return docNode.getDocumentElement();
120 }
121 else if (nodeType == Node.ELEMENT_NODE)
122 {
123 return (Element) node;
124 }
125 else
126 {
127 String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
128 System.err.println(message);
129 logger.warn(message);
130 return null;
131 }
132 }
133
134 /** returns a DOM Document */
135 public Document getDOM(String in)
136 {
137
138 try
139 {
140 Reader reader = new StringReader(in);
141 InputSource xml_source = new InputSource(reader);
142
143 DOMParser parser = new DOMParser();
144 parser.setFeature("http://xml.org/sax/features/validation", false);
145 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
146 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
147 if(resolver != null)
148 {
149 parser.setEntityResolver(this.resolver);
150 }
151 parser.setErrorHandler(new ParseErrorHandler());
152 parser.parse(xml_source);
153
154 Document doc = parser.getDocument();
155
156 return doc;
157
158 }
159 catch (Exception e)
160 {
161 logger.error(e.getMessage());
162 }
163 return null;
164 }
165
166 /** returns a DOM Document */
167 public Document getDOM(File in)
168 {
169 try
170 {
171 FileReader reader = new FileReader(in);
172 InputSource xml_source = new InputSource(reader);
173
174 DOMParser parser = new DOMParser();
175 parser.setFeature("http://xml.org/sax/features/validation", false);
176 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
177 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
178 if(resolver != null)
179 {
180 parser.setEntityResolver(this.resolver);
181 }
182 parser.setErrorHandler(new ParseErrorHandler());
183 parser.parse(xml_source);
184
185 Document doc = parser.getDocument();
186 return doc;
187
188 }
189 catch (Exception e)
190 {
191 logger.error(e.getMessage(), e);
192
193 }
194 return null;
195 }
196
197 /** returns a DOM document */
198 public Document getDOM(File in, String encoding)
199 {
200 try
201 {
202
203 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
204 InputSource xml_source = new InputSource(isr);
205
206 DOMParser parser = new DOMParser();
207 parser.setFeature("http://xml.org/sax/features/validation", false);
208 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
209 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
210 if(resolver != null)
211 {
212 parser.setEntityResolver(this.resolver);
213 }
214 parser.setErrorHandler(new ParseErrorHandler());
215 parser.parse(xml_source);
216
217 Document doc = parser.getDocument();
218
219 return doc;
220
221 }
222 catch (Exception e)
223 {
224 logger.error(e.getMessage());
225 }
226 return null;
227 }
228
229 /** creates a new empty DOM Document */
230 public static Document newDOM()
231 {
232 Document doc = new DocumentImpl();
233 return doc;
234 }
235
236 /**
237 * This method's parameters represent the parts of the Doctype of this
238 * Document that is to be created. For more info see
239 * http://xerces.apache.org
240 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
241 * .html#DocumentTypeImpl
242 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
243 *
244 * */
245 public static Document newDOM(String qualifiedName, String publicID, String systemID)
246 {
247 // create empty DOM document
248 DocumentImpl docImpl = new DocumentImpl();
249
250 // Need to use the document to create the docType for it
251 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
252
253 // Although we have created the docType using the document, we need to still
254 // put it into the empty document we just created
255 try
256 {
257 docImpl.appendChild(myDocType);
258 }
259 catch (Exception e)
260 {
261 System.out.println("Could not append docType because: " + e);
262 }
263
264 // return the document containing a DocType
265 return docImpl;
266 }
267
268 /** returns the Node as a String */
269 public static String getString(Node xmlNode)
270 {
271 outputEscaping = true;
272 StringBuffer xmlRepresentation = new StringBuffer();
273 getString(xmlNode, xmlRepresentation, 0, false);
274 return xmlRepresentation.toString();
275 }
276
277 /**
278 * returns the node as a nicely formatted String - this introduces extra
279 * text nodes if the String is read back in as a DOM, so should only be used
280 * for printing
281 */
282 public static String getPrettyString(Node xmlNode)
283 {
284
285 outputEscaping = true;
286 StringBuffer xmlRepresentation = new StringBuffer();
287 getString(xmlNode, xmlRepresentation, 0, true);
288 return xmlRepresentation.toString();
289 }
290
291 /*
292 * For the purposes of logger.debug statements, where this is called and
293 * hence outputted, returns an empty string if debugging is not enabled
294 */
295 public static String getPrettyStringLogger(Node xmlNode, Logger log)
296 {
297
298 if (log.isDebugEnabled())
299 return getPrettyString(xmlNode);
300
301 return "";
302
303 }
304
305 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
306 {
307
308 if (xmlNode == null)
309 {
310 xmlRepresentation.append("<null>");
311 return;
312 }
313
314 short nodeType = xmlNode.getNodeType();
315 String nodeName = xmlNode.getNodeName();
316
317 if (nodeType == Node.DOCUMENT_NODE)
318 {
319 Document xmlDocNode = (Document) xmlNode;
320
321 //if (xmlDocNode.getDoctype() == null) {
322 //System.err.println("Doctype is null.");
323 //}
324 //else {
325 if (xmlDocNode.getDoctype() != null)
326 {
327 DocumentType dt = xmlDocNode.getDoctype();
328
329 String name = dt.getName();
330 String pid = dt.getPublicId();
331 String sid = dt.getSystemId();
332
333 // Use previously assigned name, not dt.getName() again
334 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
335
336 xmlRepresentation.append(doctype_str);
337 }
338 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
339 return;
340 }
341 // Handle Element nodes
342 if (nodeType == Node.ELEMENT_NODE)
343 {
344 if (pretty)
345 {
346 xmlRepresentation.append("\n");
347 for (int i = 0; i < depth; i++)
348 {
349 xmlRepresentation.append(" ");
350 }
351 }
352
353 // Write opening tag
354 xmlRepresentation.append("<");
355 xmlRepresentation.append(nodeName);
356
357 // Write the node attributes
358 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
359 for (int i = 0; i < nodeAttributes.getLength(); i++)
360 {
361 Node attribute = nodeAttributes.item(i);
362 xmlRepresentation.append(" ");
363 xmlRepresentation.append(attribute.getNodeName());
364 xmlRepresentation.append("=\"");
365 xmlRepresentation.append(attribute.getNodeValue());
366 xmlRepresentation.append("\"");
367 }
368
369 // If the node has no children, close the opening tag and return
370 if (xmlNode.hasChildNodes() == false)
371 {
372 // This produces somewhat ugly output, but it is necessary to compensate
373 // for display bugs in Netscape. Firstly, the space is needed before the
374 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
375 // example). Also, a newline character would be expected after the tag,
376 // but this causes problems with the display of links (the link text
377 // will contain a newline character, which is displayed badly).
378 xmlRepresentation.append(" />");
379 return;
380 }
381
382 // Close the opening tag
383 xmlRepresentation.append(">");
384
385 // Apply recursively to the children of this node
386 // hack for nodes next to text nodes - dont make them pretty
387 // this is needed for text inside a <pre> element - any new lines
388 // or spaces around the span elements show up in the text
389 NodeList children = xmlNode.getChildNodes();
390 boolean do_pretty = pretty;
391 for (int i = 0; i < children.getLength(); i++)
392 {
393 if (children.item(i).getNodeType() == Node.TEXT_NODE)
394 {
395 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
396 }
397 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
398 }
399
400 // Write closing tag
401 if (pretty)
402 {
403 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
404 {
405 for (int i = 0; i < depth; i++)
406 xmlRepresentation.append(" ");
407 }
408 }
409 xmlRepresentation.append("</");
410 xmlRepresentation.append(nodeName);
411 xmlRepresentation.append(">");
412 if (pretty)
413 {
414 xmlRepresentation.append("\n");
415 }
416 }
417
418 // Handle Text nodes
419 else if (nodeType == Node.TEXT_NODE)
420 {
421 String text = xmlNode.getNodeValue();
422
423 // Perform output escaping, if required
424 // Apache Commons replace method is far superior to String.replaceAll - very fast!
425 if (outputEscaping)
426 {
427
428 text = StringUtils.replace(text, "&", "&amp;");
429 text = StringUtils.replace(text, "<", "&lt;");
430 text = StringUtils.replace(text, ">", "&gt;");
431 text = StringUtils.replace(text, "'", "&apos;");
432 text = StringUtils.replace(text, "\"", "&quot;");
433 }
434
435 // Remove any control-C characters
436 text = StringUtils.replace(text, "" + (char) 3, "");
437
438 xmlRepresentation.append(text);
439 }
440
441 // Handle Processing Instruction nodes
442 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
443 {
444 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
445 {
446 outputEscaping = false;
447 }
448 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
449 {
450 outputEscaping = true;
451 }
452 else
453 {
454 logger.warn("Unhandled processing instruction " + nodeName);
455 }
456 }
457
458 else if (nodeType == Node.COMMENT_NODE)
459 {
460 String text = xmlNode.getNodeValue();
461 xmlRepresentation.append("<!-- ");
462 xmlRepresentation.append(text);
463 xmlRepresentation.append(" -->");
464 }
465
466 // A type of node that is not handled yet
467 else
468 {
469 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
470 }
471
472 return;
473 }
474
475 protected static String getNodeTypeString(short node_type)
476 {
477
478 String type = "";
479 switch (node_type)
480 {
481 case Node.ATTRIBUTE_NODE:
482 type = "ATTRIBUTE_NODE";
483 break;
484 case Node.CDATA_SECTION_NODE:
485 type = "CDATA_SECTION_NODE";
486 break;
487 case Node.COMMENT_NODE:
488 type = "COMMENT_NODE";
489 break;
490 case Node.DOCUMENT_FRAGMENT_NODE:
491 type = "DOCUMENT_FRAGMENT_NODE";
492 break;
493 case Node.DOCUMENT_NODE:
494 type = "DOCUMENT_NODE";
495 break;
496 case Node.DOCUMENT_TYPE_NODE:
497 type = "DOCUMENT_TYPE_NODE";
498 break;
499 case Node.ELEMENT_NODE:
500 type = "ELEMENT_NODE";
501 break;
502 case Node.ENTITY_NODE:
503 type = "ENTITY_NODE";
504 break;
505 case Node.ENTITY_REFERENCE_NODE:
506 type = "ENTITY_REFERENCE_NODE";
507 break;
508 case Node.NOTATION_NODE:
509 type = "NOTATION_NODE";
510 break;
511 case Node.PROCESSING_INSTRUCTION_NODE:
512 type = "PROCESSING_INSTRUCTION_NODE";
513 break;
514 case Node.TEXT_NODE:
515 type = "TEXT_NODE";
516 break;
517 default:
518 type = "UNKNOWN";
519 }
520
521 return type;
522 }
523
524 // returns null if there no error occurred during parsing, or else returns the error message
525 public String getParseErrorMessage()
526 {
527 ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
528 return errorHandler.getErrorMessage();
529 }
530
531 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
532 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
533 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
534 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
535 static public class ParseErrorHandler implements ErrorHandler
536 {
537 protected String errorMessage = null;
538
539 // Receive notification of a recoverable error.
540 public void error(SAXParseException exception)
541 {
542 handleError("Error:\n", exception);
543 }
544
545 // Receive notification of a non-recoverable error.
546 public void fatalError(SAXParseException exception)
547 {
548 handleError("Fatal Error:\n", exception);
549 }
550
551 // Receive notification of a warning.
552 public void warning(SAXParseException exception)
553 {
554 handleError("Warning:\n", exception);
555 }
556
557 public String toString(SAXParseException e)
558 {
559 String msg = e.getMessage();
560 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
561 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
562 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
563
564 return msg;
565 }
566
567 // clears the errorPage variable after first call to this method
568 public String getErrorMessage()
569 {
570 String errMsg = this.errorMessage;
571 if (this.errorMessage != null)
572 {
573 this.errorMessage = null;
574 }
575 return errMsg;
576 }
577
578 // sets the errorMessage member variable to the data stored in the exception
579 // and writes the errorMessage to the logger and tomcat's System.err
580 protected void handleError(String errorType, SAXParseException exception)
581 {
582 this.errorMessage = errorType + toString(exception);
583 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
584 logger.error(this.errorMessage);
585 }
586 }
587}
Note: See TracBrowser for help on using the repository browser.