source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 34112

Last change on this file since 34112 was 34112, checked in by ak19, 4 years ago

GS3 source code seems to already use FileInputStream with UTF-8 encoding where XML files are concerned. Refactored XML_FILE_ENCODING into GSConstants class so both XMLConverter and now XMLTransformer can use it.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.8 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// import constant GSConstants.XML_FILE_ENCODING, so this file can use it as XML_FILE_ENCODING directly
22//import static org.greenstone.gsdl3.util.GSConstants.XML_FILE_ENCODING;
23
24// XML classes
25import org.w3c.dom.DOMImplementation;
26import org.w3c.dom.Document;
27import org.w3c.dom.DocumentType;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.NodeList;
31import org.w3c.dom.NamedNodeMap;
32import org.xml.sax.InputSource;
33import org.xml.sax.EntityResolver;
34import org.xml.sax.ErrorHandler;
35import org.xml.sax.SAXParseException;
36import org.xml.sax.SAXNotRecognizedException;
37import org.xml.sax.SAXNotSupportedException;
38import org.apache.xerces.parsers.DOMParser;
39import org.apache.xerces.dom.DocumentImpl; // for new Documents
40import org.apache.xerces.dom.DocumentTypeImpl;
41
42// other java classes
43import java.io.BufferedWriter;
44import java.io.ByteArrayInputStream;
45import java.io.FileWriter;
46import java.io.InputStream;
47import java.io.Reader;
48import java.io.InputStreamReader;
49import java.io.StringReader;
50import java.io.File;
51import java.io.FileInputStream;
52import java.io.FileReader;
53import java.util.regex.*;
54
55import org.apache.log4j.*;
56
57// Apache Commons
58import org.apache.commons.lang3.*;
59
60import java.util.*;
61import java.lang.reflect.*;
62
63/**
64 * XMLConverter - utility class for greenstone
65 *
66 * generates new Documents
67 * parses XML Strings into Documents, converts Nodes to Strings
68 * different parsers have different behaviour - can experiment in here
69 * at the moment we only use xerces
70 * all xerces specific code is in here
71 */
72public class XMLConverter
73{
74
75 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
76
77
78
79
80 static {
81 String fileEncodingProperty = System.getProperty("file.encoding");
82
83 // log a warning if the file.encoding Java property departs from GSConstants.XML_FILE_ENCODING,
84 // but always use the latter for reading in XML files in getDOM(File, ...) methods
85
86 // Stopped passing around JAVA_TOOL_OPTIONS="-Dfile.encoding=UTF-8" again.
87 // So this is likely to be null by default. Don't want to see the warnings about it being null.
88 /*if(fileEncodingProperty == null) {
89 logger.warn("file.encoding Java property was not set.");
90 logger.warn("Using XMLConverter.GSConstants.XML_FILE_ENCODING="+GSConstants.XML_FILE_ENCODING+"in XMLConverter.getDOM() calls.");
91 }
92 else if(!fileEncodingProperty.equals(GSConstants.XML_FILE_ENCODING)) {
93 */
94 if(fileEncodingProperty != null && !fileEncodingProperty.equals(GSConstants.XML_FILE_ENCODING)) {
95 logger.warn("file.encoding property was passed in with JAVA_TOOLS_OPTIONS as: "
96 + fileEncodingProperty);
97 logger.warn("This does not match XMLConverter.GSConstants.XML_FILE_ENCODING: " + GSConstants.XML_FILE_ENCODING);
98 logger.warn("Using XMLConverter.GSConstants.XML_FILE_ENCODING in XMLConverter.getDOM() and XMLTransformer calls.");
99 }
100 };
101
102
103 /** the no-args constructor */
104 public XMLConverter()
105 {
106
107 }
108
109 /** returns a DOM Document
110 * Question: why is this not simply calling the getDOM(String in, String encoding) version?
111 * Answer: Because the string, being a Java String, is already fully formed in unicode,
112 * so it doesn't need to call that other version.
113 */
114 public static Document getDOM(String in)
115 {
116
117 try
118 {
119 Reader reader = new StringReader(in);
120 InputSource xml_source = new InputSource(reader);
121 Document doc = getDOM(xml_source, null);
122 reader.close();
123 return doc;
124
125 }
126 catch (Exception e)
127 {
128 logger.error(e.getMessage());
129 logger.error("Input string was:\n" + in);
130 e.printStackTrace();
131 }
132 return null;
133 }
134
135 /** returns a DOM Document */
136 public static Document getDOM(String in, String encoding)
137 {
138 try
139 {
140 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
141 InputSource xml_source = new InputSource(reader);
142 Document doc = getDOM(xml_source, null);
143 reader.close();
144 return doc;
145
146 }
147 catch (Exception e)
148 {
149 logger.error(e.getMessage());
150 logger.error("Input string was:\n" + in);
151 e.printStackTrace();
152 }
153 return null;
154 }
155
156 /** returns a DOM Document */
157 public static Document getDOM(File in) {
158
159 // now we're always going to explicitly use GSConstants.XML_FILE_ENCODING (UTF-8)
160 // as the encoding to read in a file unless otherwise specified.
161 return getDOM(in, GSConstants.XML_FILE_ENCODING, null);
162
163 /*
164 try
165 {
166 FileReader reader = new FileReader(in);
167 InputSource xml_source = new InputSource(reader);
168 Document doc = getDOM(xml_source, null);
169 reader.close();
170 return doc;
171
172 }
173 catch (Exception e)
174 {
175 logger.error(e.getMessage(), e);
176 logger.error("File was:\n" + in.getPath());
177 e.printStackTrace();
178
179 }
180 return null;
181 */
182 }
183
184 public static Document getDOM(File in, String encoding) {
185 return getDOM(in, encoding, null);
186 }
187
188 /** returns a DOM document */
189 public static Document getDOM(File in, String encoding, EntityResolver er) {
190
191 try {
192
193
194 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
195 InputSource xml_source = new InputSource(isr);
196 Document doc = getDOM(xml_source, er);
197 isr.close();
198 return doc;
199
200 }
201 catch (Exception e)
202 {
203 logger.error(e.getMessage());
204 logger.error("File was:\n" + in.getPath());
205 e.printStackTrace();
206 }
207 return null;
208 }
209
210 public static Document getDOM(File in, EntityResolver er) {
211 // now we're always going to explicitly use GSConstants.XML_FILE_ENCODING (UTF-8)
212 // as the encoding to read in a file unless otherwise specified.
213 return getDOM(in, GSConstants.XML_FILE_ENCODING, er);
214 /*
215 try {
216 InputSource xml_source = new InputSource(new FileInputStream(in));
217 Document doc = getDOM(xml_source, er);
218 return doc;
219 }
220 catch (Exception e)
221 {
222 logger.error(e.getMessage());
223 logger.error("File was:\n" + in.getPath());
224 e.printStackTrace();
225 }
226 return null;
227 */
228 }
229
230 public static Document getDOM(InputSource source, EntityResolver er) {
231
232 try {
233 DOMParser parser = new DOMParser();
234 parser.setFeature("http://xml.org/sax/features/validation", false);
235 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
236 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
237 // a performance test showed that having this on lead to increased
238 // memory use for small-medium docs, and not much gain for large
239 // docs.
240 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
241 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
242 // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
243 parser.setErrorHandler(new ParseErrorHandler());
244 if (er != null) {
245 parser.setEntityResolver(er);
246 }
247 parser.parse(source);
248
249 Document doc = parser.getDocument();
250 return doc;
251
252 } catch (Exception e) {
253
254 logger.error(e.getMessage());
255 logger.error("InputSource:\n");
256 e.printStackTrace();
257 }
258 return null;
259
260 }
261
262 /** creates a new empty DOM Document */
263 public static Document newDOM()
264 {
265 Document doc = new DocumentImpl();
266 return doc;
267 }
268
269 /**
270 * This method's parameters represent the parts of the Doctype of this
271 * Document that is to be created. For more info see
272 * http://xerces.apache.org
273 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
274 * .html#DocumentTypeImpl
275 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
276 *
277 * */
278 public static Document newDOM(String qualifiedName, String publicID, String systemID)
279 {
280 // create empty DOM document
281 DocumentImpl docImpl = new DocumentImpl();
282
283 // Need to use the document to create the docType for it
284 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
285
286 // Although we have created the docType using the document, we need to still
287 // put it into the empty document we just created
288 try
289 {
290 docImpl.appendChild(myDocType);
291 }
292 catch (Exception e)
293 {
294 System.out.println("Could not append docType because: " + e);
295 }
296
297 // return the document containing a DocType
298 return docImpl;
299 }
300
301 /** returns the Node as a String */
302 public static String getString(Node xmlNode)
303 {
304 StringBuffer xmlRepresentation = new StringBuffer();
305 getString(xmlNode, xmlRepresentation, 0, false);
306 return xmlRepresentation.toString();
307 }
308
309 /**
310 * returns the node as a nicely formatted String - this introduces extra
311 * text nodes if the String is read back in as a DOM, so should only be used
312 * for printing
313 */
314 public static String getPrettyString(Node xmlNode)
315 {
316 StringBuffer xmlRepresentation = new StringBuffer();
317 getString(xmlNode, xmlRepresentation, 0, true);
318 return xmlRepresentation.toString();
319 }
320
321 /*
322 * For the purposes of logger.debug statements, where this is called and
323 * hence outputted, returns an empty string if debugging is not enabled
324 */
325 public static String getPrettyStringLogger(Node xmlNode, Logger log)
326 {
327
328 if (log.isDebugEnabled())
329 return getPrettyString(xmlNode);
330
331 return "";
332
333 }
334
335 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
336 {
337
338 if (xmlNode == null)
339 {
340 xmlRepresentation.append("<null>");
341 return;
342 }
343
344 short nodeType = xmlNode.getNodeType();
345 String nodeName = xmlNode.getNodeName();
346
347 if (nodeType == Node.DOCUMENT_NODE)
348 {
349 Document xmlDocNode = (Document) xmlNode;
350
351 //if (xmlDocNode.getDoctype() == null) {
352 //System.err.println("Doctype is null.");
353 //}
354 //else {
355 if (xmlDocNode.getDoctype() != null)
356 {
357 DocumentType dt = xmlDocNode.getDoctype();
358
359 String name = dt.getName();
360 String pid = dt.getPublicId();
361 String sid = dt.getSystemId();
362
363 // Use previously assigned name, not dt.getName() again
364 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
365
366 xmlRepresentation.append(doctype_str);
367 }
368 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
369 return;
370 }
371 // Handle Element nodes
372 if (nodeType == Node.ELEMENT_NODE)
373 {
374 if (pretty)
375 {
376 xmlRepresentation.append("\n");
377 for (int i = 0; i < depth; i++)
378 {
379 xmlRepresentation.append(" ");
380 }
381 }
382
383 // Write opening tag
384 xmlRepresentation.append("<");
385 xmlRepresentation.append(nodeName);
386
387 // Write the node attributes
388 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
389 for (int i = 0; i < nodeAttributes.getLength(); i++)
390 {
391 Node attribute = nodeAttributes.item(i);
392 xmlRepresentation.append(" ");
393 xmlRepresentation.append(attribute.getNodeName());
394 xmlRepresentation.append("=\"");
395 String attr_val = attribute.getNodeValue();
396
397 attr_val = attr_val.replaceAll("&","&amp;");
398 attr_val = attr_val.replaceAll("<","&lt;");
399 attr_val = attr_val.replaceAll(">","&gt;");
400 attr_val = attr_val.replaceAll("\"","&quot;");
401
402 // assume that any of the above chars that was already entity escaped
403 // was already correct => return back to how they were
404 attr_val = attr_val.replaceAll("&amp;amp;","&amp;");
405 attr_val = attr_val.replaceAll("&amp;lt;","&lt;");
406 attr_val = attr_val.replaceAll("&amp;gt;","&gt;");
407 attr_val = attr_val.replaceAll("&amp;quot;","&quot;");
408 attr_val = attr_val.replaceAll("&amp;apos;","&apos;");
409
410
411 xmlRepresentation.append(attr_val);
412 xmlRepresentation.append("\"");
413 }
414
415 // If the node has no children, close the opening tag and return
416 if (xmlNode.hasChildNodes() == false)
417 {
418 // This produces somewhat ugly output, but it is necessary to compensate
419 // for display bugs in Netscape. Firstly, the space is needed before the
420 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
421 // example). Also, a newline character would be expected after the tag,
422 // but this causes problems with the display of links (the link text
423 // will contain a newline character, which is displayed badly).
424 xmlRepresentation.append(" />");
425 return;
426 }
427
428 // Close the opening tag
429 xmlRepresentation.append(">");
430
431 // Process the children. We process text nodes here, but recursively process other nodes.
432 // hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
433 // Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
434 // also if these are inside a pre tag then the space shows up in the page.
435
436 NodeList children = xmlNode.getChildNodes();
437 boolean do_pretty = pretty;
438 boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
439 for (int i = 0; i < children.getLength(); i++)
440 {
441 Node child = children.item(i);
442 short child_type = child.getNodeType();
443 if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
444 if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
445 output_escaping = false;
446 }
447 else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
448 output_escaping = true;
449 }
450 else {
451 logger.warn("Unhandled processing instruction " + child.getNodeName());
452 }
453 }
454 else if (child_type == Node.TEXT_NODE) {
455 do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
456 // output the text
457 String text = child.getNodeValue();
458
459 // Perform output escaping, if required
460 // Apache Commons replace method is far superior to String.replaceAll - very fast!
461 if (output_escaping) {
462 text = StringUtils.replace(text, "&", "&amp;");
463 text = StringUtils.replace(text, "<", "&lt;");
464 text = StringUtils.replace(text, ">", "&gt;");
465 text = StringUtils.replace(text, "'", "&apos;");
466 text = StringUtils.replace(text, "\"", "&quot;");
467 }
468 // Remove any control-C characters
469 text = StringUtils.replace(text, "" + (char) 3, "");
470
471 xmlRepresentation.append(text);
472
473 }
474 else {
475 // recursively call getString
476 getString(child, xmlRepresentation, depth + 1, do_pretty);
477 }
478 } // foreach child of the element
479
480 // Write closing tag
481 if (pretty)
482 {
483 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
484 {
485 for (int i = 0; i < depth; i++)
486 xmlRepresentation.append(" ");
487 }
488 }
489 xmlRepresentation.append("</");
490 xmlRepresentation.append(nodeName);
491 xmlRepresentation.append(">");
492 if (pretty)
493 {
494 xmlRepresentation.append("\n");
495 }
496 } // ELEMENT_NODE
497
498 else if (nodeType == Node.COMMENT_NODE)
499 {
500 String text = xmlNode.getNodeValue();
501 xmlRepresentation.append("<!-- ");
502 xmlRepresentation.append(text);
503 xmlRepresentation.append(" -->");
504 }
505
506 // TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
507 // A type of node that is not handled yet
508 else
509 {
510 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
511 }
512
513 return;
514 }
515
516 protected static String getNodeTypeString(short node_type)
517 {
518
519 String type = "";
520 switch (node_type)
521 {
522 case Node.ATTRIBUTE_NODE:
523 type = "ATTRIBUTE_NODE";
524 break;
525 case Node.CDATA_SECTION_NODE:
526 type = "CDATA_SECTION_NODE";
527 break;
528 case Node.COMMENT_NODE:
529 type = "COMMENT_NODE";
530 break;
531 case Node.DOCUMENT_FRAGMENT_NODE:
532 type = "DOCUMENT_FRAGMENT_NODE";
533 break;
534 case Node.DOCUMENT_NODE:
535 type = "DOCUMENT_NODE";
536 break;
537 case Node.DOCUMENT_TYPE_NODE:
538 type = "DOCUMENT_TYPE_NODE";
539 break;
540 case Node.ELEMENT_NODE:
541 type = "ELEMENT_NODE";
542 break;
543 case Node.ENTITY_NODE:
544 type = "ENTITY_NODE";
545 break;
546 case Node.ENTITY_REFERENCE_NODE:
547 type = "ENTITY_REFERENCE_NODE";
548 break;
549 case Node.NOTATION_NODE:
550 type = "NOTATION_NODE";
551 break;
552 case Node.PROCESSING_INSTRUCTION_NODE:
553 type = "PROCESSING_INSTRUCTION_NODE";
554 break;
555 case Node.TEXT_NODE:
556 type = "TEXT_NODE";
557 break;
558 default:
559 type = "UNKNOWN";
560 }
561
562 return type;
563 }
564
565 // returns null if there no error occurred during parsing, or else returns the error message
566
567 // public String getParseErrorMessage()
568 // {
569 // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
570 // return errorHandler.getErrorMessage();
571 // }
572
573 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
574 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
575 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
576 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
577 static public class ParseErrorHandler implements ErrorHandler
578 {
579 protected String errorMessage = null;
580
581 // Receive notification of a recoverable error.
582 public void error(SAXParseException exception)
583 {
584 handleError("Error:\n", exception);
585 }
586
587 // Receive notification of a non-recoverable error.
588 public void fatalError(SAXParseException exception)
589 {
590 handleError("Fatal Error:\n", exception);
591 }
592
593 // Receive notification of a warning.
594 public void warning(SAXParseException exception)
595 {
596 handleError("Warning:\n", exception);
597 }
598
599 public String toString(SAXParseException e)
600 {
601 String msg = e.getMessage();
602 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
603 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
604 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
605
606 return msg;
607 }
608
609 // clears the errorPage variable after first call to this method
610 public String getErrorMessage()
611 {
612 String errMsg = this.errorMessage;
613 if (this.errorMessage != null)
614 {
615 this.errorMessage = null;
616 }
617 return errMsg;
618 }
619
620 // sets the errorMessage member variable to the data stored in the exception
621 // and writes the errorMessage to the logger and tomcat's System.err
622 protected void handleError(String errorType, SAXParseException exception)
623 {
624 this.errorMessage = errorType + toString(exception);
625 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
626 logger.error(this.errorMessage);
627 }
628 }
629
630 public static boolean writeDOM(Element elem, File file) {
631
632 BufferedWriter writer = null;
633 boolean success = false;
634 try {
635 String xml_string = getString(elem);
636 // need createNewFile???
637 writer = new BufferedWriter(new FileWriter(file));
638 writer.write(xml_string);
639 success = true;
640 }
641
642 catch (Exception e) {
643 logger.error(e.getMessage());
644 success = false;
645 }
646 finally {
647 try {
648 if (writer != null) {
649 writer.close();
650 }
651 } catch(Exception e) {
652 logger.error("couldn't close the file"+e.getMessage());
653 }
654 }
655 return success;
656 }
657}
Note: See TracBrowser for help on using the repository browser.