source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 25655

Last change on this file since 25655 was 25655, checked in by sjm84, 12 years ago

Added a version of getDOM that takes an encoding (not sure if this is even useful though)

  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.apache.xerces.parsers.DOMParser;
34import org.apache.xerces.dom.*; // for new Documents
35
36// other java classes
37import java.io.ByteArrayInputStream;
38import java.io.InputStream;
39import java.io.Reader;
40import java.io.InputStreamReader;
41import java.io.StringReader;
42import java.io.File;
43import java.io.FileInputStream;
44import java.io.FileReader;
45import java.util.regex.*;
46
47import org.apache.log4j.*;
48
49// Apache Commons
50import org.apache.commons.lang3.*;
51
52import java.util.*;
53import java.lang.reflect.*;
54
55/**
56 * XMLConverter - utility class for greenstone
57 *
58 * parses XML Strings into Documents, converts Nodes to Strings different
59 * parsers have different behaviour - can experiment in here now we only use
60 * xerces
61 *
62 */
63public class XMLConverter
64{
65
66 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
67
68 protected EntityResolver resolver = null;
69
70 /** xerces parser */
71 protected DOMParser parser = null;
72
73 private static boolean outputEscaping = true;
74
75 /** the no-args constructor */
76 public XMLConverter()
77 {
78 try
79 {
80 this.parser = new DOMParser();
81 this.parser.setFeature("http://xml.org/sax/features/validation", false);
82 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
83 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
84 // a performance test showed that having this on lead to increased
85 // memory use for small-medium docs, and not much gain for large
86 // docs.
87 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
88 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
89 // add an errorhandler to the parser which will store useful a error message on encountering fatal errors, errors and warnings when parsing
90 // this errormessage can then be converted to xhtml and displayed in a browser.
91 this.parser.setErrorHandler(new ParseErrorHandler());
92 }
93 catch (Exception e)
94 {
95 logger.error(e.getMessage());
96 }
97 }
98
99 /** sets the entity resolver. pass in null to unset it */
100 public void setEntityResolver(EntityResolver er)
101 {
102 this.resolver = er;
103 this.parser.setEntityResolver(er);
104 }
105
106 /**
107 * Given a Node representing an Element or Document, will return the
108 * Element/docroot Element. Returns null if the Node was not an element.
109 */
110 public static Element nodeToElement(Node node)
111 {
112 if (node == null)
113 {
114 return null;
115 }
116 short nodeType = node.getNodeType();
117
118 if (nodeType == Node.DOCUMENT_NODE)
119 {
120 Document docNode = (Document) node;
121 return docNode.getDocumentElement();
122 }
123 else if (nodeType == Node.ELEMENT_NODE)
124 {
125 return (Element) node;
126 }
127 else
128 {
129 String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
130 System.err.println(message);
131 logger.warn(message);
132 return null;
133 }
134 }
135
136 /** returns a DOM Document */
137 public Document getDOM(String in)
138 {
139
140 try
141 {
142 Reader reader = new StringReader(in);
143 InputSource xml_source = new InputSource(reader);
144
145 DOMParser parser = new DOMParser();
146 parser.setFeature("http://xml.org/sax/features/validation", false);
147 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
148 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
149 if(resolver != null)
150 {
151 parser.setEntityResolver(this.resolver);
152 }
153 parser.setErrorHandler(new ParseErrorHandler());
154 parser.parse(xml_source);
155
156 Document doc = parser.getDocument();
157
158 return doc;
159
160 }
161 catch (Exception e)
162 {
163 logger.error(e.getMessage());
164 }
165 return null;
166 }
167
168 /** returns a DOM Document */
169 public Document getDOM(String in, String encoding)
170 {
171 try
172 {
173 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
174 InputSource xml_source = new InputSource(reader);
175
176 DOMParser parser = new DOMParser();
177 parser.setFeature("http://xml.org/sax/features/validation", false);
178 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
179 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
180 if(resolver != null)
181 {
182 parser.setEntityResolver(this.resolver);
183 }
184 parser.setErrorHandler(new ParseErrorHandler());
185 parser.parse(xml_source);
186
187 Document doc = parser.getDocument();
188
189 return doc;
190
191 }
192 catch (Exception e)
193 {
194 logger.error(e.getMessage());
195 }
196 return null;
197 }
198
199 /** returns a DOM Document */
200 public Document getDOM(File in)
201 {
202 try
203 {
204 FileReader reader = new FileReader(in);
205 InputSource xml_source = new InputSource(reader);
206
207 DOMParser parser = new DOMParser();
208 parser.setFeature("http://xml.org/sax/features/validation", false);
209 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
210 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
211 if(resolver != null)
212 {
213 parser.setEntityResolver(this.resolver);
214 }
215 parser.setErrorHandler(new ParseErrorHandler());
216 parser.parse(xml_source);
217
218 Document doc = parser.getDocument();
219 return doc;
220
221 }
222 catch (Exception e)
223 {
224 logger.error(e.getMessage(), e);
225
226 }
227 return null;
228 }
229
230 /** returns a DOM document */
231 public Document getDOM(File in, String encoding)
232 {
233 try
234 {
235
236 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
237 InputSource xml_source = new InputSource(isr);
238
239 DOMParser parser = new DOMParser();
240 parser.setFeature("http://xml.org/sax/features/validation", false);
241 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
242 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
243 if(resolver != null)
244 {
245 parser.setEntityResolver(this.resolver);
246 }
247 parser.setErrorHandler(new ParseErrorHandler());
248 parser.parse(xml_source);
249
250 Document doc = parser.getDocument();
251
252 return doc;
253
254 }
255 catch (Exception e)
256 {
257 logger.error(e.getMessage());
258 }
259 return null;
260 }
261
262 /** creates a new empty DOM Document */
263 public static Document newDOM()
264 {
265 Document doc = new DocumentImpl();
266 return doc;
267 }
268
269 /**
270 * This method's parameters represent the parts of the Doctype of this
271 * Document that is to be created. For more info see
272 * http://xerces.apache.org
273 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
274 * .html#DocumentTypeImpl
275 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
276 *
277 * */
278 public static Document newDOM(String qualifiedName, String publicID, String systemID)
279 {
280 // create empty DOM document
281 DocumentImpl docImpl = new DocumentImpl();
282
283 // Need to use the document to create the docType for it
284 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
285
286 // Although we have created the docType using the document, we need to still
287 // put it into the empty document we just created
288 try
289 {
290 docImpl.appendChild(myDocType);
291 }
292 catch (Exception e)
293 {
294 System.out.println("Could not append docType because: " + e);
295 }
296
297 // return the document containing a DocType
298 return docImpl;
299 }
300
301 /** returns the Node as a String */
302 public static String getString(Node xmlNode)
303 {
304 outputEscaping = true;
305 StringBuffer xmlRepresentation = new StringBuffer();
306 getString(xmlNode, xmlRepresentation, 0, false);
307 return xmlRepresentation.toString();
308 }
309
310 /**
311 * returns the node as a nicely formatted String - this introduces extra
312 * text nodes if the String is read back in as a DOM, so should only be used
313 * for printing
314 */
315 public static String getPrettyString(Node xmlNode)
316 {
317
318 outputEscaping = true;
319 StringBuffer xmlRepresentation = new StringBuffer();
320 getString(xmlNode, xmlRepresentation, 0, true);
321 return xmlRepresentation.toString();
322 }
323
324 /*
325 * For the purposes of logger.debug statements, where this is called and
326 * hence outputted, returns an empty string if debugging is not enabled
327 */
328 public static String getPrettyStringLogger(Node xmlNode, Logger log)
329 {
330
331 if (log.isDebugEnabled())
332 return getPrettyString(xmlNode);
333
334 return "";
335
336 }
337
338 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
339 {
340
341 if (xmlNode == null)
342 {
343 xmlRepresentation.append("<null>");
344 return;
345 }
346
347 short nodeType = xmlNode.getNodeType();
348 String nodeName = xmlNode.getNodeName();
349
350 if (nodeType == Node.DOCUMENT_NODE)
351 {
352 Document xmlDocNode = (Document) xmlNode;
353
354 //if (xmlDocNode.getDoctype() == null) {
355 //System.err.println("Doctype is null.");
356 //}
357 //else {
358 if (xmlDocNode.getDoctype() != null)
359 {
360 DocumentType dt = xmlDocNode.getDoctype();
361
362 String name = dt.getName();
363 String pid = dt.getPublicId();
364 String sid = dt.getSystemId();
365
366 // Use previously assigned name, not dt.getName() again
367 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
368
369 xmlRepresentation.append(doctype_str);
370 }
371 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
372 return;
373 }
374 // Handle Element nodes
375 if (nodeType == Node.ELEMENT_NODE)
376 {
377 if (pretty)
378 {
379 xmlRepresentation.append("\n");
380 for (int i = 0; i < depth; i++)
381 {
382 xmlRepresentation.append(" ");
383 }
384 }
385
386 // Write opening tag
387 xmlRepresentation.append("<");
388 xmlRepresentation.append(nodeName);
389
390 // Write the node attributes
391 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
392 for (int i = 0; i < nodeAttributes.getLength(); i++)
393 {
394 Node attribute = nodeAttributes.item(i);
395 xmlRepresentation.append(" ");
396 xmlRepresentation.append(attribute.getNodeName());
397 xmlRepresentation.append("=\"");
398 xmlRepresentation.append(attribute.getNodeValue());
399 xmlRepresentation.append("\"");
400 }
401
402 // If the node has no children, close the opening tag and return
403 if (xmlNode.hasChildNodes() == false)
404 {
405 // This produces somewhat ugly output, but it is necessary to compensate
406 // for display bugs in Netscape. Firstly, the space is needed before the
407 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
408 // example). Also, a newline character would be expected after the tag,
409 // but this causes problems with the display of links (the link text
410 // will contain a newline character, which is displayed badly).
411 xmlRepresentation.append(" />");
412 return;
413 }
414
415 // Close the opening tag
416 xmlRepresentation.append(">");
417
418 // Apply recursively to the children of this node
419 // hack for nodes next to text nodes - dont make them pretty
420 // this is needed for text inside a <pre> element - any new lines
421 // or spaces around the span elements show up in the text
422 NodeList children = xmlNode.getChildNodes();
423 boolean do_pretty = pretty;
424 for (int i = 0; i < children.getLength(); i++)
425 {
426 if (children.item(i).getNodeType() == Node.TEXT_NODE)
427 {
428 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
429 }
430 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
431 }
432
433 // Write closing tag
434 if (pretty)
435 {
436 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
437 {
438 for (int i = 0; i < depth; i++)
439 xmlRepresentation.append(" ");
440 }
441 }
442 xmlRepresentation.append("</");
443 xmlRepresentation.append(nodeName);
444 xmlRepresentation.append(">");
445 if (pretty)
446 {
447 xmlRepresentation.append("\n");
448 }
449 }
450
451 // Handle Text nodes
452 else if (nodeType == Node.TEXT_NODE)
453 {
454 String text = xmlNode.getNodeValue();
455
456 // Perform output escaping, if required
457 // Apache Commons replace method is far superior to String.replaceAll - very fast!
458 if (outputEscaping)
459 {
460
461 text = StringUtils.replace(text, "&", "&amp;");
462 text = StringUtils.replace(text, "<", "&lt;");
463 text = StringUtils.replace(text, ">", "&gt;");
464 text = StringUtils.replace(text, "'", "&apos;");
465 text = StringUtils.replace(text, "\"", "&quot;");
466 }
467
468 // Remove any control-C characters
469 text = StringUtils.replace(text, "" + (char) 3, "");
470
471 xmlRepresentation.append(text);
472 }
473
474 // Handle Processing Instruction nodes
475 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
476 {
477 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
478 {
479 outputEscaping = false;
480 }
481 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
482 {
483 outputEscaping = true;
484 }
485 else
486 {
487 logger.warn("Unhandled processing instruction " + nodeName);
488 }
489 }
490
491 else if (nodeType == Node.COMMENT_NODE)
492 {
493 String text = xmlNode.getNodeValue();
494 xmlRepresentation.append("<!-- ");
495 xmlRepresentation.append(text);
496 xmlRepresentation.append(" -->");
497 }
498
499 // A type of node that is not handled yet
500 else
501 {
502 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
503 }
504
505 return;
506 }
507
508 protected static String getNodeTypeString(short node_type)
509 {
510
511 String type = "";
512 switch (node_type)
513 {
514 case Node.ATTRIBUTE_NODE:
515 type = "ATTRIBUTE_NODE";
516 break;
517 case Node.CDATA_SECTION_NODE:
518 type = "CDATA_SECTION_NODE";
519 break;
520 case Node.COMMENT_NODE:
521 type = "COMMENT_NODE";
522 break;
523 case Node.DOCUMENT_FRAGMENT_NODE:
524 type = "DOCUMENT_FRAGMENT_NODE";
525 break;
526 case Node.DOCUMENT_NODE:
527 type = "DOCUMENT_NODE";
528 break;
529 case Node.DOCUMENT_TYPE_NODE:
530 type = "DOCUMENT_TYPE_NODE";
531 break;
532 case Node.ELEMENT_NODE:
533 type = "ELEMENT_NODE";
534 break;
535 case Node.ENTITY_NODE:
536 type = "ENTITY_NODE";
537 break;
538 case Node.ENTITY_REFERENCE_NODE:
539 type = "ENTITY_REFERENCE_NODE";
540 break;
541 case Node.NOTATION_NODE:
542 type = "NOTATION_NODE";
543 break;
544 case Node.PROCESSING_INSTRUCTION_NODE:
545 type = "PROCESSING_INSTRUCTION_NODE";
546 break;
547 case Node.TEXT_NODE:
548 type = "TEXT_NODE";
549 break;
550 default:
551 type = "UNKNOWN";
552 }
553
554 return type;
555 }
556
557 // returns null if there no error occurred during parsing, or else returns the error message
558 public String getParseErrorMessage()
559 {
560 ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
561 return errorHandler.getErrorMessage();
562 }
563
564 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
565 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
566 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
567 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
568 static public class ParseErrorHandler implements ErrorHandler
569 {
570 protected String errorMessage = null;
571
572 // Receive notification of a recoverable error.
573 public void error(SAXParseException exception)
574 {
575 handleError("Error:\n", exception);
576 }
577
578 // Receive notification of a non-recoverable error.
579 public void fatalError(SAXParseException exception)
580 {
581 handleError("Fatal Error:\n", exception);
582 }
583
584 // Receive notification of a warning.
585 public void warning(SAXParseException exception)
586 {
587 handleError("Warning:\n", exception);
588 }
589
590 public String toString(SAXParseException e)
591 {
592 String msg = e.getMessage();
593 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
594 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
595 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
596
597 return msg;
598 }
599
600 // clears the errorPage variable after first call to this method
601 public String getErrorMessage()
602 {
603 String errMsg = this.errorMessage;
604 if (this.errorMessage != null)
605 {
606 this.errorMessage = null;
607 }
608 return errMsg;
609 }
610
611 // sets the errorMessage member variable to the data stored in the exception
612 // and writes the errorMessage to the logger and tomcat's System.err
613 protected void handleError(String errorType, SAXParseException exception)
614 {
615 this.errorMessage = errorType + toString(exception);
616 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
617 logger.error(this.errorMessage);
618 }
619 }
620}
Note: See TracBrowser for help on using the repository browser.