source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 34101

Last change on this file since 34101 was 34101, checked in by ak19, 4 years ago

Correcting syntax error and overlooked another instance of System.err.println() that should have been a logger.warn().

  • Property svn:keywords set to Author Date Id Revision
File size: 19.3 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
// XML classes
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.NamedNodeMap;
import org.xml.sax.InputSource;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXParseException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.apache.xerces.parsers.DOMParser;
import org.apache.xerces.dom.DocumentImpl; // for new Documents
import org.apache.xerces.dom.DocumentTypeImpl;

// other java classes
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.*;

import org.apache.log4j.*;

// Apache Commons
import org.apache.commons.lang3.*;

import java.util.*;
import java.lang.reflect.*;
59
60/**
61 * XMLConverter - utility class for greenstone
62 *
63 * generates new Documents
64 * parses XML Strings into Documents, converts Nodes to Strings
65 * different parsers have different behaviour - can experiment in here
66 * at the moment we only use xerces
67 * all xerces specific code is in here
68 */
69public class XMLConverter
70{
71
/** Logger used by the static methods of this class and by ParseErrorHandler. */
static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());

/** Character encoding that the getDOM(File, ...) overloads use by default when reading XML files. */
public static final String XML_FILE_ENCODING = "UTF-8";

// When the class is loaded, log a warning if the file.encoding Java property is
// unset or departs from XML_FILE_ENCODING. This is informational only: the
// getDOM(File, ...) methods use XML_FILE_ENCODING unless a caller passes another encoding.
static {
    String fileEncodingProperty = System.getProperty("file.encoding");

    if (fileEncodingProperty == null) {
        logger.warn("file.encoding Java property was not set.");
        // note the leading space before "in": without it the encoding name ran straight into the next word
        logger.warn("Using XMLConverter.XML_FILE_ENCODING=" + XML_FILE_ENCODING + " in XMLConverter.getDOM() calls.");
    }
    else if (!fileEncodingProperty.equals(XML_FILE_ENCODING)) {
        logger.warn("file.encoding property passed in with JAVA_TOOLS_OPTIONS: "
                    + fileEncodingProperty);
        logger.warn("does not match XMLConverter.XML_FILE_ENCODING: " + XML_FILE_ENCODING);
        logger.warn("Using XMLConverter.XML_FILE_ENCODING in XMLConverter.getDOM() calls.");
    }
}
93
94
95 /** the no-args constructor */
96 public XMLConverter()
97 {
98
99 }
100
101 /** returns a DOM Document
102 * Question: why is this not simply calling the getDOM(String in, String encoding) version?
103 * Answer: Because the string, being a Java String, is already fully formed in unicode,
104 * so it doesn't need to call that other version.
105 */
106 public static Document getDOM(String in)
107 {
108
109 try
110 {
111 Reader reader = new StringReader(in);
112 InputSource xml_source = new InputSource(reader);
113 Document doc = getDOM(xml_source, null);
114 reader.close();
115 return doc;
116
117 }
118 catch (Exception e)
119 {
120 logger.error(e.getMessage());
121 logger.error("Input string was:\n" + in);
122 e.printStackTrace();
123 }
124 return null;
125 }
126
127 /** returns a DOM Document */
128 public static Document getDOM(String in, String encoding)
129 {
130 try
131 {
132 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
133 InputSource xml_source = new InputSource(reader);
134 Document doc = getDOM(xml_source, null);
135 reader.close();
136 return doc;
137
138 }
139 catch (Exception e)
140 {
141 logger.error(e.getMessage());
142 logger.error("Input string was:\n" + in);
143 e.printStackTrace();
144 }
145 return null;
146 }
147
148 /** returns a DOM Document */
149 public static Document getDOM(File in) {
150
151 // now we're always going to explicitly use XML_FILE_ENCODING (UTF-8)
152 // as the encoding to read in a file unless otherwise specified.
153 return getDOM(in, XML_FILE_ENCODING, null);
154
155 /*
156 try
157 {
158 FileReader reader = new FileReader(in);
159 InputSource xml_source = new InputSource(reader);
160 Document doc = getDOM(xml_source, null);
161 reader.close();
162 return doc;
163
164 }
165 catch (Exception e)
166 {
167 logger.error(e.getMessage(), e);
168 logger.error("File was:\n" + in.getPath());
169 e.printStackTrace();
170
171 }
172 return null;
173 */
174 }
175
176 public static Document getDOM(File in, String encoding) {
177 return getDOM(in, encoding, null);
178 }
179
180 /** returns a DOM document */
181 public static Document getDOM(File in, String encoding, EntityResolver er) {
182
183 try {
184
185
186 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
187 InputSource xml_source = new InputSource(isr);
188 Document doc = getDOM(xml_source, er);
189 isr.close();
190 return doc;
191
192 }
193 catch (Exception e)
194 {
195 logger.error(e.getMessage());
196 logger.error("File was:\n" + in.getPath());
197 e.printStackTrace();
198 }
199 return null;
200 }
201
202 public static Document getDOM(File in, EntityResolver er) {
203 // now we're always going to explicitly use XML_FILE_ENCODING (UTF-8)
204 // as the encoding to read in a file unless otherwise specified.
205 return getDOM(in, XML_FILE_ENCODING, er);
206 /*
207 try {
208 InputSource xml_source = new InputSource(new FileInputStream(in));
209 Document doc = getDOM(xml_source, er);
210 return doc;
211 }
212 catch (Exception e)
213 {
214 logger.error(e.getMessage());
215 logger.error("File was:\n" + in.getPath());
216 e.printStackTrace();
217 }
218 return null;
219 */
220 }
221
222 public static Document getDOM(InputSource source, EntityResolver er) {
223
224 try {
225 DOMParser parser = new DOMParser();
226 parser.setFeature("http://xml.org/sax/features/validation", false);
227 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
228 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
229 // a performance test showed that having this on lead to increased
230 // memory use for small-medium docs, and not much gain for large
231 // docs.
232 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
233 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
234 // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
235 parser.setErrorHandler(new ParseErrorHandler());
236 if (er != null) {
237 parser.setEntityResolver(er);
238 }
239 parser.parse(source);
240
241 Document doc = parser.getDocument();
242 return doc;
243
244 } catch (Exception e) {
245
246 logger.error(e.getMessage());
247 logger.error("InputSource:\n");
248 e.printStackTrace();
249 }
250 return null;
251
252 }
253
254 /** creates a new empty DOM Document */
255 public static Document newDOM()
256 {
257 Document doc = new DocumentImpl();
258 return doc;
259 }
260
261 /**
262 * This method's parameters represent the parts of the Doctype of this
263 * Document that is to be created. For more info see
264 * http://xerces.apache.org
265 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
266 * .html#DocumentTypeImpl
267 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
268 *
269 * */
270 public static Document newDOM(String qualifiedName, String publicID, String systemID)
271 {
272 // create empty DOM document
273 DocumentImpl docImpl = new DocumentImpl();
274
275 // Need to use the document to create the docType for it
276 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
277
278 // Although we have created the docType using the document, we need to still
279 // put it into the empty document we just created
280 try
281 {
282 docImpl.appendChild(myDocType);
283 }
284 catch (Exception e)
285 {
286 System.out.println("Could not append docType because: " + e);
287 }
288
289 // return the document containing a DocType
290 return docImpl;
291 }
292
293 /** returns the Node as a String */
294 public static String getString(Node xmlNode)
295 {
296 StringBuffer xmlRepresentation = new StringBuffer();
297 getString(xmlNode, xmlRepresentation, 0, false);
298 return xmlRepresentation.toString();
299 }
300
301 /**
302 * returns the node as a nicely formatted String - this introduces extra
303 * text nodes if the String is read back in as a DOM, so should only be used
304 * for printing
305 */
306 public static String getPrettyString(Node xmlNode)
307 {
308 StringBuffer xmlRepresentation = new StringBuffer();
309 getString(xmlNode, xmlRepresentation, 0, true);
310 return xmlRepresentation.toString();
311 }
312
313 /*
314 * For the purposes of logger.debug statements, where this is called and
315 * hence outputted, returns an empty string if debugging is not enabled
316 */
317 public static String getPrettyStringLogger(Node xmlNode, Logger log)
318 {
319
320 if (log.isDebugEnabled())
321 return getPrettyString(xmlNode);
322
323 return "";
324
325 }
326
327 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
328 {
329
330 if (xmlNode == null)
331 {
332 xmlRepresentation.append("<null>");
333 return;
334 }
335
336 short nodeType = xmlNode.getNodeType();
337 String nodeName = xmlNode.getNodeName();
338
339 if (nodeType == Node.DOCUMENT_NODE)
340 {
341 Document xmlDocNode = (Document) xmlNode;
342
343 //if (xmlDocNode.getDoctype() == null) {
344 //System.err.println("Doctype is null.");
345 //}
346 //else {
347 if (xmlDocNode.getDoctype() != null)
348 {
349 DocumentType dt = xmlDocNode.getDoctype();
350
351 String name = dt.getName();
352 String pid = dt.getPublicId();
353 String sid = dt.getSystemId();
354
355 // Use previously assigned name, not dt.getName() again
356 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
357
358 xmlRepresentation.append(doctype_str);
359 }
360 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
361 return;
362 }
363 // Handle Element nodes
364 if (nodeType == Node.ELEMENT_NODE)
365 {
366 if (pretty)
367 {
368 xmlRepresentation.append("\n");
369 for (int i = 0; i < depth; i++)
370 {
371 xmlRepresentation.append(" ");
372 }
373 }
374
375 // Write opening tag
376 xmlRepresentation.append("<");
377 xmlRepresentation.append(nodeName);
378
379 // Write the node attributes
380 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
381 for (int i = 0; i < nodeAttributes.getLength(); i++)
382 {
383 Node attribute = nodeAttributes.item(i);
384 xmlRepresentation.append(" ");
385 xmlRepresentation.append(attribute.getNodeName());
386 xmlRepresentation.append("=\"");
387 String attr_val = attribute.getNodeValue();
388
389 attr_val = attr_val.replaceAll("&","&amp;");
390 attr_val = attr_val.replaceAll("<","&lt;");
391 attr_val = attr_val.replaceAll(">","&gt;");
392 attr_val = attr_val.replaceAll("\"","&quot;");
393
394 // assume that any of the above chars that was already entity escaped
395 // was already correct => return back to how they were
396 attr_val = attr_val.replaceAll("&amp;amp;","&amp;");
397 attr_val = attr_val.replaceAll("&amp;lt;","&lt;");
398 attr_val = attr_val.replaceAll("&amp;gt;","&gt;");
399 attr_val = attr_val.replaceAll("&amp;quot;","&quot;");
400 attr_val = attr_val.replaceAll("&amp;apos;","&apos;");
401
402
403 xmlRepresentation.append(attr_val);
404 xmlRepresentation.append("\"");
405 }
406
407 // If the node has no children, close the opening tag and return
408 if (xmlNode.hasChildNodes() == false)
409 {
410 // This produces somewhat ugly output, but it is necessary to compensate
411 // for display bugs in Netscape. Firstly, the space is needed before the
412 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
413 // example). Also, a newline character would be expected after the tag,
414 // but this causes problems with the display of links (the link text
415 // will contain a newline character, which is displayed badly).
416 xmlRepresentation.append(" />");
417 return;
418 }
419
420 // Close the opening tag
421 xmlRepresentation.append(">");
422
423 // Process the children. We process text nodes here, but recursively process other nodes.
424 // hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
425 // Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
426 // also if these are inside a pre tag then the space shows up in the page.
427
428 NodeList children = xmlNode.getChildNodes();
429 boolean do_pretty = pretty;
430 boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
431 for (int i = 0; i < children.getLength(); i++)
432 {
433 Node child = children.item(i);
434 short child_type = child.getNodeType();
435 if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
436 if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
437 output_escaping = false;
438 }
439 else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
440 output_escaping = true;
441 }
442 else {
443 logger.warn("Unhandled processing instruction " + child.getNodeName());
444 }
445 }
446 else if (child_type == Node.TEXT_NODE) {
447 do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
448 // output the text
449 String text = child.getNodeValue();
450
451 // Perform output escaping, if required
452 // Apache Commons replace method is far superior to String.replaceAll - very fast!
453 if (output_escaping) {
454 text = StringUtils.replace(text, "&", "&amp;");
455 text = StringUtils.replace(text, "<", "&lt;");
456 text = StringUtils.replace(text, ">", "&gt;");
457 text = StringUtils.replace(text, "'", "&apos;");
458 text = StringUtils.replace(text, "\"", "&quot;");
459 }
460 // Remove any control-C characters
461 text = StringUtils.replace(text, "" + (char) 3, "");
462
463 xmlRepresentation.append(text);
464
465 }
466 else {
467 // recursively call getString
468 getString(child, xmlRepresentation, depth + 1, do_pretty);
469 }
470 } // foreach child of the element
471
472 // Write closing tag
473 if (pretty)
474 {
475 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
476 {
477 for (int i = 0; i < depth; i++)
478 xmlRepresentation.append(" ");
479 }
480 }
481 xmlRepresentation.append("</");
482 xmlRepresentation.append(nodeName);
483 xmlRepresentation.append(">");
484 if (pretty)
485 {
486 xmlRepresentation.append("\n");
487 }
488 } // ELEMENT_NODE
489
490 else if (nodeType == Node.COMMENT_NODE)
491 {
492 String text = xmlNode.getNodeValue();
493 xmlRepresentation.append("<!-- ");
494 xmlRepresentation.append(text);
495 xmlRepresentation.append(" -->");
496 }
497
498 // TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
499 // A type of node that is not handled yet
500 else
501 {
502 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
503 }
504
505 return;
506 }
507
508 protected static String getNodeTypeString(short node_type)
509 {
510
511 String type = "";
512 switch (node_type)
513 {
514 case Node.ATTRIBUTE_NODE:
515 type = "ATTRIBUTE_NODE";
516 break;
517 case Node.CDATA_SECTION_NODE:
518 type = "CDATA_SECTION_NODE";
519 break;
520 case Node.COMMENT_NODE:
521 type = "COMMENT_NODE";
522 break;
523 case Node.DOCUMENT_FRAGMENT_NODE:
524 type = "DOCUMENT_FRAGMENT_NODE";
525 break;
526 case Node.DOCUMENT_NODE:
527 type = "DOCUMENT_NODE";
528 break;
529 case Node.DOCUMENT_TYPE_NODE:
530 type = "DOCUMENT_TYPE_NODE";
531 break;
532 case Node.ELEMENT_NODE:
533 type = "ELEMENT_NODE";
534 break;
535 case Node.ENTITY_NODE:
536 type = "ENTITY_NODE";
537 break;
538 case Node.ENTITY_REFERENCE_NODE:
539 type = "ENTITY_REFERENCE_NODE";
540 break;
541 case Node.NOTATION_NODE:
542 type = "NOTATION_NODE";
543 break;
544 case Node.PROCESSING_INSTRUCTION_NODE:
545 type = "PROCESSING_INSTRUCTION_NODE";
546 break;
547 case Node.TEXT_NODE:
548 type = "TEXT_NODE";
549 break;
550 default:
551 type = "UNKNOWN";
552 }
553
554 return type;
555 }
556
557 // returns null if no error occurred during parsing, or else returns the error message
558
559 // public String getParseErrorMessage()
560 // {
561 // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
562 // return errorHandler.getErrorMessage();
563 // }
564
565 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
566 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
567 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
568 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
569 static public class ParseErrorHandler implements ErrorHandler
570 {
571 protected String errorMessage = null;
572
573 // Receive notification of a recoverable error.
574 public void error(SAXParseException exception)
575 {
576 handleError("Error:\n", exception);
577 }
578
579 // Receive notification of a non-recoverable error.
580 public void fatalError(SAXParseException exception)
581 {
582 handleError("Fatal Error:\n", exception);
583 }
584
585 // Receive notification of a warning.
586 public void warning(SAXParseException exception)
587 {
588 handleError("Warning:\n", exception);
589 }
590
591 public String toString(SAXParseException e)
592 {
593 String msg = e.getMessage();
594 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
595 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
596 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
597
598 return msg;
599 }
600
601 // clears the errorPage variable after first call to this method
602 public String getErrorMessage()
603 {
604 String errMsg = this.errorMessage;
605 if (this.errorMessage != null)
606 {
607 this.errorMessage = null;
608 }
609 return errMsg;
610 }
611
612 // sets the errorMessage member variable to the data stored in the exception
613 // and writes the errorMessage to the logger and tomcat's System.err
614 protected void handleError(String errorType, SAXParseException exception)
615 {
616 this.errorMessage = errorType + toString(exception);
617 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
618 logger.error(this.errorMessage);
619 }
620 }
621
622 public static boolean writeDOM(Element elem, File file) {
623
624 BufferedWriter writer = null;
625 boolean success = false;
626 try {
627 String xml_string = getString(elem);
628 // need createNewFile???
629 writer = new BufferedWriter(new FileWriter(file));
630 writer.write(xml_string);
631 success = true;
632 }
633
634 catch (Exception e) {
635 logger.error(e.getMessage());
636 success = false;
637 }
638 finally {
639 try {
640 if (writer != null) {
641 writer.close();
642 }
643 } catch(Exception e) {
644 logger.error("couldn't close the file"+e.getMessage());
645 }
646 }
647 return success;
648 }
649}
Note: See TracBrowser for help on using the repository browser.