source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 33897

Last change on this file since 33897 was 33897, checked in by kjdon, 4 years ago

Elsewhere in the code (GSXML.xmlSafe) we escape ' => &apos;, so we need to make sure we recover this here too; otherwise we get &apos; in the interface, e.g. in the search box if the search term contains an apostrophe.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.xml.sax.SAXNotRecognizedException;
34import org.xml.sax.SAXNotSupportedException;
35import org.apache.xerces.parsers.DOMParser;
36import org.apache.xerces.dom.DocumentImpl; // for new Documents
37import org.apache.xerces.dom.DocumentTypeImpl;
38
39// other java classes
40import java.io.BufferedWriter;
41import java.io.ByteArrayInputStream;
42import java.io.FileWriter;
43import java.io.InputStream;
44import java.io.Reader;
45import java.io.InputStreamReader;
46import java.io.StringReader;
47import java.io.File;
48import java.io.FileInputStream;
49import java.io.FileReader;
50import java.util.regex.*;
51
52import org.apache.log4j.*;
53
54// Apache Commons
55import org.apache.commons.lang3.*;
56
57import java.util.*;
58import java.lang.reflect.*;
59
60/**
61 * XMLConverter - utility class for greenstone
62 *
63 * generates new Documents
64 * parses XML Strings into Documents, converts Nodes to Strings
65 * different parsers have different behaviour - can experiment in here
66 * at the moment we only use xerces
67 * all xerces specific code is in here
68 */
69public class XMLConverter
70{
71
72 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74 /** the no-args constructor */
75 public XMLConverter()
76 {
77
78 }
79
80 /** returns a DOM Document */
81 public static Document getDOM(String in)
82 {
83
84 try
85 {
86 Reader reader = new StringReader(in);
87 InputSource xml_source = new InputSource(reader);
88 Document doc = getDOM(xml_source, null);
89 reader.close();
90 return doc;
91
92 }
93 catch (Exception e)
94 {
95 logger.error(e.getMessage());
96 logger.error("Input string was:\n" + in);
97 e.printStackTrace();
98 }
99 return null;
100 }
101
102 /** returns a DOM Document */
103 public static Document getDOM(String in, String encoding)
104 {
105 try
106 {
107 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
108 InputSource xml_source = new InputSource(reader);
109 Document doc = getDOM(xml_source, null);
110 reader.close();
111 return doc;
112
113 }
114 catch (Exception e)
115 {
116 logger.error(e.getMessage());
117 logger.error("Input string was:\n" + in);
118 e.printStackTrace();
119 }
120 return null;
121 }
122
123 /** returns a DOM Document */
124 public static Document getDOM(File in) {
125 try
126 {
127 FileReader reader = new FileReader(in);
128 InputSource xml_source = new InputSource(reader);
129 Document doc = getDOM(xml_source, null);
130 reader.close();
131 return doc;
132
133 }
134 catch (Exception e)
135 {
136 logger.error(e.getMessage(), e);
137 logger.error("File was:\n" + in.getPath());
138 e.printStackTrace();
139
140 }
141 return null;
142 }
143
144 public static Document getDOM(File in, String encoding) {
145 return getDOM(in, encoding, null);
146 }
147
148 /** returns a DOM document */
149 public static Document getDOM(File in, String encoding, EntityResolver er) {
150
151 try {
152
153
154 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
155 InputSource xml_source = new InputSource(isr);
156 Document doc = getDOM(xml_source, er);
157 isr.close();
158 return doc;
159
160 }
161 catch (Exception e)
162 {
163 logger.error(e.getMessage());
164 logger.error("File was:\n" + in.getPath());
165 e.printStackTrace();
166 }
167 return null;
168 }
169
170 public static Document getDOM(File in, EntityResolver er) {
171
172 try {
173 InputSource xml_source = new InputSource(new FileInputStream(in));
174 Document doc = getDOM(xml_source, er);
175 return doc;
176 }
177 catch (Exception e)
178 {
179 logger.error(e.getMessage());
180 logger.error("File was:\n" + in.getPath());
181 e.printStackTrace();
182 }
183 return null;
184 }
185
186 public static Document getDOM(InputSource source, EntityResolver er) {
187
188 try {
189 DOMParser parser = new DOMParser();
190 parser.setFeature("http://xml.org/sax/features/validation", false);
191 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
192 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
193 // a performance test showed that having this on lead to increased
194 // memory use for small-medium docs, and not much gain for large
195 // docs.
196 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
197 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
198 // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
199 parser.setErrorHandler(new ParseErrorHandler());
200 if (er != null) {
201 parser.setEntityResolver(er);
202 }
203 parser.parse(source);
204
205 Document doc = parser.getDocument();
206 return doc;
207
208 } catch (Exception e) {
209
210 logger.error(e.getMessage());
211 logger.error("InputSource:\n");
212 e.printStackTrace();
213 }
214 return null;
215
216 }
217
218 /** creates a new empty DOM Document */
219 public static Document newDOM()
220 {
221 Document doc = new DocumentImpl();
222 return doc;
223 }
224
225 /**
226 * This method's parameters represent the parts of the Doctype of this
227 * Document that is to be created. For more info see
228 * http://xerces.apache.org
229 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
230 * .html#DocumentTypeImpl
231 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
232 *
233 * */
234 public static Document newDOM(String qualifiedName, String publicID, String systemID)
235 {
236 // create empty DOM document
237 DocumentImpl docImpl = new DocumentImpl();
238
239 // Need to use the document to create the docType for it
240 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
241
242 // Although we have created the docType using the document, we need to still
243 // put it into the empty document we just created
244 try
245 {
246 docImpl.appendChild(myDocType);
247 }
248 catch (Exception e)
249 {
250 System.out.println("Could not append docType because: " + e);
251 }
252
253 // return the document containing a DocType
254 return docImpl;
255 }
256
257 /** returns the Node as a String */
258 public static String getString(Node xmlNode)
259 {
260 StringBuffer xmlRepresentation = new StringBuffer();
261 getString(xmlNode, xmlRepresentation, 0, false);
262 return xmlRepresentation.toString();
263 }
264
265 /**
266 * returns the node as a nicely formatted String - this introduces extra
267 * text nodes if the String is read back in as a DOM, so should only be used
268 * for printing
269 */
270 public static String getPrettyString(Node xmlNode)
271 {
272 StringBuffer xmlRepresentation = new StringBuffer();
273 getString(xmlNode, xmlRepresentation, 0, true);
274 return xmlRepresentation.toString();
275 }
276
277 /*
278 * For the purposes of logger.debug statements, where this is called and
279 * hence outputted, returns an empty string if debugging is not enabled
280 */
281 public static String getPrettyStringLogger(Node xmlNode, Logger log)
282 {
283
284 if (log.isDebugEnabled())
285 return getPrettyString(xmlNode);
286
287 return "";
288
289 }
290
291 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
292 {
293
294 if (xmlNode == null)
295 {
296 xmlRepresentation.append("<null>");
297 return;
298 }
299
300 short nodeType = xmlNode.getNodeType();
301 String nodeName = xmlNode.getNodeName();
302
303 if (nodeType == Node.DOCUMENT_NODE)
304 {
305 Document xmlDocNode = (Document) xmlNode;
306
307 //if (xmlDocNode.getDoctype() == null) {
308 //System.err.println("Doctype is null.");
309 //}
310 //else {
311 if (xmlDocNode.getDoctype() != null)
312 {
313 DocumentType dt = xmlDocNode.getDoctype();
314
315 String name = dt.getName();
316 String pid = dt.getPublicId();
317 String sid = dt.getSystemId();
318
319 // Use previously assigned name, not dt.getName() again
320 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
321
322 xmlRepresentation.append(doctype_str);
323 }
324 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
325 return;
326 }
327 // Handle Element nodes
328 if (nodeType == Node.ELEMENT_NODE)
329 {
330 if (pretty)
331 {
332 xmlRepresentation.append("\n");
333 for (int i = 0; i < depth; i++)
334 {
335 xmlRepresentation.append(" ");
336 }
337 }
338
339 // Write opening tag
340 xmlRepresentation.append("<");
341 xmlRepresentation.append(nodeName);
342
343 // Write the node attributes
344 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
345 for (int i = 0; i < nodeAttributes.getLength(); i++)
346 {
347 Node attribute = nodeAttributes.item(i);
348 xmlRepresentation.append(" ");
349 xmlRepresentation.append(attribute.getNodeName());
350 xmlRepresentation.append("=\"");
351 String attr_val = attribute.getNodeValue();
352
353 attr_val = attr_val.replaceAll("&","&amp;");
354 attr_val = attr_val.replaceAll("<","&lt;");
355 attr_val = attr_val.replaceAll(">","&gt;");
356 attr_val = attr_val.replaceAll("\"","&quot;");
357
358 // assume that any of the above chars that was already entity escaped
359 // was already correct => return back to how they were
360 attr_val = attr_val.replaceAll("&amp;amp;","&amp;");
361 attr_val = attr_val.replaceAll("&amp;lt;","&lt;");
362 attr_val = attr_val.replaceAll("&amp;gt;","&gt;");
363 attr_val = attr_val.replaceAll("&amp;quot;","&quot;");
364 attr_val = attr_val.replaceAll("&amp;apos;","&apos;");
365
366
367 xmlRepresentation.append(attr_val);
368 xmlRepresentation.append("\"");
369 }
370
371 // If the node has no children, close the opening tag and return
372 if (xmlNode.hasChildNodes() == false)
373 {
374 // This produces somewhat ugly output, but it is necessary to compensate
375 // for display bugs in Netscape. Firstly, the space is needed before the
376 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
377 // example). Also, a newline character would be expected after the tag,
378 // but this causes problems with the display of links (the link text
379 // will contain a newline character, which is displayed badly).
380 xmlRepresentation.append(" />");
381 return;
382 }
383
384 // Close the opening tag
385 xmlRepresentation.append(">");
386
387 // Process the children. We process text nodes here, but recursively process other nodes.
388 // hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
389 // Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
390 // also if these are inside a pre tag then the space shows up in the page.
391
392 NodeList children = xmlNode.getChildNodes();
393 boolean do_pretty = pretty;
394 boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
395 for (int i = 0; i < children.getLength(); i++)
396 {
397 Node child = children.item(i);
398 short child_type = child.getNodeType();
399 if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
400 if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
401 output_escaping = false;
402 }
403 else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
404 output_escaping = true;
405 }
406 else {
407 logger.warn("Unhandled processing instruction " + child.getNodeName());
408 }
409 }
410 else if (child_type == Node.TEXT_NODE) {
411 do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
412 // output the text
413 String text = child.getNodeValue();
414
415 // Perform output escaping, if required
416 // Apache Commons replace method is far superior to String.replaceAll - very fast!
417 if (output_escaping) {
418 text = StringUtils.replace(text, "&", "&amp;");
419 text = StringUtils.replace(text, "<", "&lt;");
420 text = StringUtils.replace(text, ">", "&gt;");
421 text = StringUtils.replace(text, "'", "&apos;");
422 text = StringUtils.replace(text, "\"", "&quot;");
423 }
424 // Remove any control-C characters
425 text = StringUtils.replace(text, "" + (char) 3, "");
426
427 xmlRepresentation.append(text);
428
429 }
430 else {
431 // recursively call getString
432 getString(child, xmlRepresentation, depth + 1, do_pretty);
433 }
434 } // foreach child of the element
435
436 // Write closing tag
437 if (pretty)
438 {
439 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
440 {
441 for (int i = 0; i < depth; i++)
442 xmlRepresentation.append(" ");
443 }
444 }
445 xmlRepresentation.append("</");
446 xmlRepresentation.append(nodeName);
447 xmlRepresentation.append(">");
448 if (pretty)
449 {
450 xmlRepresentation.append("\n");
451 }
452 } // ELEMENT_NODE
453
454 else if (nodeType == Node.COMMENT_NODE)
455 {
456 String text = xmlNode.getNodeValue();
457 xmlRepresentation.append("<!-- ");
458 xmlRepresentation.append(text);
459 xmlRepresentation.append(" -->");
460 }
461
462 // TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
463 // A type of node that is not handled yet
464 else
465 {
466 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
467 }
468
469 return;
470 }
471
472 protected static String getNodeTypeString(short node_type)
473 {
474
475 String type = "";
476 switch (node_type)
477 {
478 case Node.ATTRIBUTE_NODE:
479 type = "ATTRIBUTE_NODE";
480 break;
481 case Node.CDATA_SECTION_NODE:
482 type = "CDATA_SECTION_NODE";
483 break;
484 case Node.COMMENT_NODE:
485 type = "COMMENT_NODE";
486 break;
487 case Node.DOCUMENT_FRAGMENT_NODE:
488 type = "DOCUMENT_FRAGMENT_NODE";
489 break;
490 case Node.DOCUMENT_NODE:
491 type = "DOCUMENT_NODE";
492 break;
493 case Node.DOCUMENT_TYPE_NODE:
494 type = "DOCUMENT_TYPE_NODE";
495 break;
496 case Node.ELEMENT_NODE:
497 type = "ELEMENT_NODE";
498 break;
499 case Node.ENTITY_NODE:
500 type = "ENTITY_NODE";
501 break;
502 case Node.ENTITY_REFERENCE_NODE:
503 type = "ENTITY_REFERENCE_NODE";
504 break;
505 case Node.NOTATION_NODE:
506 type = "NOTATION_NODE";
507 break;
508 case Node.PROCESSING_INSTRUCTION_NODE:
509 type = "PROCESSING_INSTRUCTION_NODE";
510 break;
511 case Node.TEXT_NODE:
512 type = "TEXT_NODE";
513 break;
514 default:
515 type = "UNKNOWN";
516 }
517
518 return type;
519 }
520
521 // returns null if no error occurred during parsing, or else returns the error message
522
523 // public String getParseErrorMessage()
524 // {
525 // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
526 // return errorHandler.getErrorMessage();
527 // }
528
529 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
530 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
531 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
532 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
533 static public class ParseErrorHandler implements ErrorHandler
534 {
535 protected String errorMessage = null;
536
537 // Receive notification of a recoverable error.
538 public void error(SAXParseException exception)
539 {
540 handleError("Error:\n", exception);
541 }
542
543 // Receive notification of a non-recoverable error.
544 public void fatalError(SAXParseException exception)
545 {
546 handleError("Fatal Error:\n", exception);
547 }
548
549 // Receive notification of a warning.
550 public void warning(SAXParseException exception)
551 {
552 handleError("Warning:\n", exception);
553 }
554
555 public String toString(SAXParseException e)
556 {
557 String msg = e.getMessage();
558 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
559 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
560 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
561
562 return msg;
563 }
564
565 // clears the errorPage variable after first call to this method
566 public String getErrorMessage()
567 {
568 String errMsg = this.errorMessage;
569 if (this.errorMessage != null)
570 {
571 this.errorMessage = null;
572 }
573 return errMsg;
574 }
575
576 // sets the errorMessage member variable to the data stored in the exception
577 // and writes the errorMessage to the logger and tomcat's System.err
578 protected void handleError(String errorType, SAXParseException exception)
579 {
580 this.errorMessage = errorType + toString(exception);
581 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
582 logger.error(this.errorMessage);
583 }
584 }
585
586 public static boolean writeDOM(Element elem, File file) {
587
588 BufferedWriter writer = null;
589 boolean success = false;
590 try {
591 String xml_string = getString(elem);
592 // need createNewFile???
593 writer = new BufferedWriter(new FileWriter(file));
594 writer.write(xml_string);
595 success = true;
596 }
597
598 catch (Exception e) {
599 logger.error(e.getMessage());
600 success = false;
601 }
602 finally {
603 try {
604 if (writer != null) {
605 writer.close();
606 }
607 } catch(Exception e) {
608 logger.error("couldn't close the file"+e.getMessage());
609 }
610 }
611 return success;
612 }
613}
Note: See TracBrowser for help on using the repository browser.