source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28849

Last change on this file since 28849 was 28849, checked in by kjdon, 10 years ago

added writeDOM method

  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.apache.xerces.parsers.DOMParser;
34import org.apache.xerces.dom.*; // for new Documents
35
36// other java classes
37import java.io.BufferedWriter;
38import java.io.ByteArrayInputStream;
39import java.io.FileWriter;
40import java.io.InputStream;
41import java.io.Reader;
42import java.io.InputStreamReader;
43import java.io.StringReader;
44import java.io.File;
45import java.io.FileInputStream;
46import java.io.FileReader;
47import java.util.regex.*;
48
49import org.apache.log4j.*;
50
51// Apache Commons
52import org.apache.commons.lang3.*;
53
54import java.util.*;
55import java.lang.reflect.*;
56
/**
 * XMLConverter - utility class for greenstone.
 *
 * Parses XML Strings into Documents and converts Nodes to Strings. Different
 * parsers have different behaviour, which can be experimented with in here;
 * at present only xerces is used.
 */
65public class XMLConverter
66{
67
68 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
69
70 protected EntityResolver resolver = null;
71
72 /** xerces parser */
73 protected DOMParser parser = null;
74
75 private static boolean outputEscaping = true;
76
/**
 * The no-args constructor. Creates the shared xerces parser and configures
 * it as non-validating, with external DTD loading and deferred node
 * expansion switched off, and with a {@link ParseErrorHandler} attached.
 *
 * Any failure while creating or configuring the parser is logged and not
 * rethrown; in that case the parser field may be left null or only partly
 * configured.
 */
public XMLConverter() {
    try {
        this.parser = new DOMParser();
        this.parser.setFeature("http://xml.org/sax/features/validation", false);
        // don't try and load external DTD - no need if we are not validating,
        // and may cause connection errors if a proxy is not set up.
        this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // a performance test showed that having this on led to increased
        // memory use for small-medium docs, and not much gain for large docs.
        // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
        this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
        // the error handler stores a useful error message on encountering
        // fatal errors, errors and warnings when parsing; that message can
        // then be converted to xhtml and displayed in a browser.
        this.parser.setErrorHandler(new ParseErrorHandler());
    } catch (Exception e) {
        // hand the exception itself to the logger so the stack trace is kept;
        // the message alone can be null and says nothing about the cause
        logger.error(e.getMessage(), e);
    }
}
100
101 /** sets the entity resolver. pass in null to unset it */
102 public void setEntityResolver(EntityResolver er)
103 {
104 this.resolver = er;
105 this.parser.setEntityResolver(er);
106 }
107
108 /**
109 * Given a Node representing an Element or Document, will return the
110 * Element/docroot Element. Returns null if the Node was not an element.
111 */
112 public static Element nodeToElement(Node node)
113 {
114 if (node == null)
115 {
116 return null;
117 }
118 short nodeType = node.getNodeType();
119
120 if (nodeType == Node.DOCUMENT_NODE)
121 {
122 Document docNode = (Document) node;
123 return docNode.getDocumentElement();
124 }
125 else if (nodeType == Node.ELEMENT_NODE)
126 {
127 return (Element) node;
128 }
129 else
130 {
131 String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
132 System.err.println(message);
133 logger.warn(message);
134 return null;
135 }
136 }
137
138 /** returns a DOM Document */
139 public Document getDOM(String in)
140 {
141
142 try
143 {
144 Reader reader = new StringReader(in);
145 InputSource xml_source = new InputSource(reader);
146
147 DOMParser parser = new DOMParser();
148 parser.setFeature("http://xml.org/sax/features/validation", false);
149 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
150 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
151 if (resolver != null)
152 {
153 parser.setEntityResolver(this.resolver);
154 }
155 parser.setErrorHandler(new ParseErrorHandler());
156 parser.parse(xml_source);
157
158 Document doc = parser.getDocument();
159
160 reader.close();
161 return doc;
162
163 }
164 catch (Exception e)
165 {
166 logger.error(e.getMessage());
167 }
168 return null;
169 }
170
171 /** returns a DOM Document */
172 public Document getDOM(String in, String encoding)
173 {
174 try
175 {
176 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
177 InputSource xml_source = new InputSource(reader);
178
179 DOMParser parser = new DOMParser();
180 parser.setFeature("http://xml.org/sax/features/validation", false);
181 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
182 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
183 if (resolver != null)
184 {
185 parser.setEntityResolver(this.resolver);
186 }
187 parser.setErrorHandler(new ParseErrorHandler());
188 parser.parse(xml_source);
189
190 Document doc = parser.getDocument();
191
192 reader.close();
193 return doc;
194
195 }
196 catch (Exception e)
197 {
198 logger.error(e.getMessage());
199 }
200 return null;
201 }
202
203 /** returns a DOM Document */
204 public Document getDOM(File in)
205 {
206 try
207 {
208 FileReader reader = new FileReader(in);
209 InputSource xml_source = new InputSource(reader);
210
211 DOMParser parser = new DOMParser();
212 parser.setFeature("http://xml.org/sax/features/validation", false);
213 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
214 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
215 if (resolver != null)
216 {
217 parser.setEntityResolver(this.resolver);
218 }
219 parser.setErrorHandler(new ParseErrorHandler());
220 parser.parse(xml_source);
221
222 Document doc = parser.getDocument();
223
224 reader.close();
225 return doc;
226
227 }
228 catch (Exception e)
229 {
230 logger.error(e.getMessage(), e);
231
232 }
233 return null;
234 }
235
236 /** returns a DOM document */
237 public Document getDOM(File in, String encoding)
238 {
239 try
240 {
241
242 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
243 InputSource xml_source = new InputSource(isr);
244
245 DOMParser parser = new DOMParser();
246 parser.setFeature("http://xml.org/sax/features/validation", false);
247 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
248 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
249 if (resolver != null)
250 {
251 parser.setEntityResolver(this.resolver);
252 }
253 parser.setErrorHandler(new ParseErrorHandler());
254 parser.parse(xml_source);
255
256 Document doc = parser.getDocument();
257
258 isr.close();
259 return doc;
260
261 }
262 catch (Exception e)
263 {
264 logger.error(e.getMessage());
265 }
266 return null;
267 }
268
269 /** creates a new empty DOM Document */
270 public Document newDOM()
271 {
272 Document doc = new DocumentImpl();
273 return doc;
274 }
275
276 /**
277 * This method's parameters represent the parts of the Doctype of this
278 * Document that is to be created. For more info see
279 * http://xerces.apache.org
280 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
281 * .html#DocumentTypeImpl
282 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
283 *
284 * */
285 public static Document newDOM(String qualifiedName, String publicID, String systemID)
286 {
287 // create empty DOM document
288 DocumentImpl docImpl = new DocumentImpl();
289
290 // Need to use the document to create the docType for it
291 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
292
293 // Although we have created the docType using the document, we need to still
294 // put it into the empty document we just created
295 try
296 {
297 docImpl.appendChild(myDocType);
298 }
299 catch (Exception e)
300 {
301 System.out.println("Could not append docType because: " + e);
302 }
303
304 // return the document containing a DocType
305 return docImpl;
306 }
307
308 /** returns the Node as a String */
309 public static String getString(Node xmlNode)
310 {
311 outputEscaping = true;
312 StringBuffer xmlRepresentation = new StringBuffer();
313 getString(xmlNode, xmlRepresentation, 0, false);
314 return xmlRepresentation.toString();
315 }
316
317 /**
318 * returns the node as a nicely formatted String - this introduces extra
319 * text nodes if the String is read back in as a DOM, so should only be used
320 * for printing
321 */
322 public static String getPrettyString(Node xmlNode)
323 {
324
325 outputEscaping = true;
326 StringBuffer xmlRepresentation = new StringBuffer();
327 getString(xmlNode, xmlRepresentation, 0, true);
328 return xmlRepresentation.toString();
329 }
330
331 /*
332 * For the purposes of logger.debug statements, where this is called and
333 * hence outputted, returns an empty string if debugging is not enabled
334 */
335 public static String getPrettyStringLogger(Node xmlNode, Logger log)
336 {
337
338 if (log.isDebugEnabled())
339 return getPrettyString(xmlNode);
340
341 return "";
342
343 }
344
345 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
346 {
347
348 if (xmlNode == null)
349 {
350 xmlRepresentation.append("<null>");
351 return;
352 }
353
354 short nodeType = xmlNode.getNodeType();
355 String nodeName = xmlNode.getNodeName();
356
357 if (nodeType == Node.DOCUMENT_NODE)
358 {
359 Document xmlDocNode = (Document) xmlNode;
360
361 //if (xmlDocNode.getDoctype() == null) {
362 //System.err.println("Doctype is null.");
363 //}
364 //else {
365 if (xmlDocNode.getDoctype() != null)
366 {
367 DocumentType dt = xmlDocNode.getDoctype();
368
369 String name = dt.getName();
370 String pid = dt.getPublicId();
371 String sid = dt.getSystemId();
372
373 // Use previously assigned name, not dt.getName() again
374 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
375
376 xmlRepresentation.append(doctype_str);
377 }
378 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
379 return;
380 }
381 // Handle Element nodes
382 if (nodeType == Node.ELEMENT_NODE)
383 {
384 if (pretty)
385 {
386 xmlRepresentation.append("\n");
387 for (int i = 0; i < depth; i++)
388 {
389 xmlRepresentation.append(" ");
390 }
391 }
392
393 // Write opening tag
394 xmlRepresentation.append("<");
395 xmlRepresentation.append(nodeName);
396
397 // Write the node attributes
398 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
399 for (int i = 0; i < nodeAttributes.getLength(); i++)
400 {
401 Node attribute = nodeAttributes.item(i);
402 xmlRepresentation.append(" ");
403 xmlRepresentation.append(attribute.getNodeName());
404 xmlRepresentation.append("=\"");
405 xmlRepresentation.append(attribute.getNodeValue());
406 xmlRepresentation.append("\"");
407 }
408
409 // If the node has no children, close the opening tag and return
410 if (xmlNode.hasChildNodes() == false)
411 {
412 // This produces somewhat ugly output, but it is necessary to compensate
413 // for display bugs in Netscape. Firstly, the space is needed before the
414 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
415 // example). Also, a newline character would be expected after the tag,
416 // but this causes problems with the display of links (the link text
417 // will contain a newline character, which is displayed badly).
418 xmlRepresentation.append(" />");
419 return;
420 }
421
422 // Close the opening tag
423 xmlRepresentation.append(">");
424
425 // Apply recursively to the children of this node
426 // hack for nodes next to text nodes - dont make them pretty
427 // this is needed for text inside a <pre> element - any new lines
428 // or spaces around the span elements show up in the text
429 NodeList children = xmlNode.getChildNodes();
430 boolean do_pretty = pretty;
431 for (int i = 0; i < children.getLength(); i++)
432 {
433 if (children.item(i).getNodeType() == Node.TEXT_NODE)
434 {
435 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
436 }
437 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
438 }
439
440 // Write closing tag
441 if (pretty)
442 {
443 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
444 {
445 for (int i = 0; i < depth; i++)
446 xmlRepresentation.append(" ");
447 }
448 }
449 xmlRepresentation.append("</");
450 xmlRepresentation.append(nodeName);
451 xmlRepresentation.append(">");
452 if (pretty)
453 {
454 xmlRepresentation.append("\n");
455 }
456 }
457
458 // Handle Text nodes
459 else if (nodeType == Node.TEXT_NODE)
460 {
461 String text = xmlNode.getNodeValue();
462
463 // Perform output escaping, if required
464 // Apache Commons replace method is far superior to String.replaceAll - very fast!
465 if (outputEscaping)
466 {
467
468 text = StringUtils.replace(text, "&", "&amp;");
469 text = StringUtils.replace(text, "<", "&lt;");
470 text = StringUtils.replace(text, ">", "&gt;");
471 text = StringUtils.replace(text, "'", "&apos;");
472 text = StringUtils.replace(text, "\"", "&quot;");
473 }
474
475 // Remove any control-C characters
476 text = StringUtils.replace(text, "" + (char) 3, "");
477
478 xmlRepresentation.append(text);
479 }
480
481 // Handle Processing Instruction nodes
482 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
483 {
484 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
485 {
486 outputEscaping = false;
487 }
488 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
489 {
490 outputEscaping = true;
491 }
492 else
493 {
494 logger.warn("Unhandled processing instruction " + nodeName);
495 }
496 }
497
498 else if (nodeType == Node.COMMENT_NODE)
499 {
500 String text = xmlNode.getNodeValue();
501 xmlRepresentation.append("<!-- ");
502 xmlRepresentation.append(text);
503 xmlRepresentation.append(" -->");
504 }
505
506 // A type of node that is not handled yet
507 else
508 {
509 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
510 }
511
512 return;
513 }
514
515 protected static String getNodeTypeString(short node_type)
516 {
517
518 String type = "";
519 switch (node_type)
520 {
521 case Node.ATTRIBUTE_NODE:
522 type = "ATTRIBUTE_NODE";
523 break;
524 case Node.CDATA_SECTION_NODE:
525 type = "CDATA_SECTION_NODE";
526 break;
527 case Node.COMMENT_NODE:
528 type = "COMMENT_NODE";
529 break;
530 case Node.DOCUMENT_FRAGMENT_NODE:
531 type = "DOCUMENT_FRAGMENT_NODE";
532 break;
533 case Node.DOCUMENT_NODE:
534 type = "DOCUMENT_NODE";
535 break;
536 case Node.DOCUMENT_TYPE_NODE:
537 type = "DOCUMENT_TYPE_NODE";
538 break;
539 case Node.ELEMENT_NODE:
540 type = "ELEMENT_NODE";
541 break;
542 case Node.ENTITY_NODE:
543 type = "ENTITY_NODE";
544 break;
545 case Node.ENTITY_REFERENCE_NODE:
546 type = "ENTITY_REFERENCE_NODE";
547 break;
548 case Node.NOTATION_NODE:
549 type = "NOTATION_NODE";
550 break;
551 case Node.PROCESSING_INSTRUCTION_NODE:
552 type = "PROCESSING_INSTRUCTION_NODE";
553 break;
554 case Node.TEXT_NODE:
555 type = "TEXT_NODE";
556 break;
557 default:
558 type = "UNKNOWN";
559 }
560
561 return type;
562 }
563
564 // returns null if there no error occurred during parsing, or else returns the error message
565 public String getParseErrorMessage()
566 {
567 ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
568 return errorHandler.getErrorMessage();
569 }
570
571 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
572 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
573 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
574 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
575 static public class ParseErrorHandler implements ErrorHandler
576 {
577 protected String errorMessage = null;
578
579 // Receive notification of a recoverable error.
580 public void error(SAXParseException exception)
581 {
582 handleError("Error:\n", exception);
583 }
584
585 // Receive notification of a non-recoverable error.
586 public void fatalError(SAXParseException exception)
587 {
588 handleError("Fatal Error:\n", exception);
589 }
590
591 // Receive notification of a warning.
592 public void warning(SAXParseException exception)
593 {
594 handleError("Warning:\n", exception);
595 }
596
597 public String toString(SAXParseException e)
598 {
599 String msg = e.getMessage();
600 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
601 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
602 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
603
604 return msg;
605 }
606
607 // clears the errorPage variable after first call to this method
608 public String getErrorMessage()
609 {
610 String errMsg = this.errorMessage;
611 if (this.errorMessage != null)
612 {
613 this.errorMessage = null;
614 }
615 return errMsg;
616 }
617
618 // sets the errorMessage member variable to the data stored in the exception
619 // and writes the errorMessage to the logger and tomcat's System.err
620 protected void handleError(String errorType, SAXParseException exception)
621 {
622 this.errorMessage = errorType + toString(exception);
623 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
624 logger.error(this.errorMessage);
625 }
626 }
627
628 public boolean writeDOM(Element elem, File file) {
629
630 BufferedWriter writer = null;
631 boolean success = false;
632 try {
633 String xml_string = getString(elem);
634 // need createNewFile???
635 writer = new BufferedWriter(new FileWriter(file));
636 writer.write(xml_string);
637 success = true;
638 }
639
640 catch (Exception e) {
641 logger.error(e.getMessage());
642 success = false;
643 }
644 finally {
645 try {
646 if (writer != null) {
647 writer.close();
648 }
649 } catch(Exception e) {
650 logger.error("couldn't close the file"+e.getMessage());
651 }
652 }
653 return success;
654 }
655}
Note: See TracBrowser for help on using the repository browser.