source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28382

Last change on this file since 28382 was 28382, checked in by davidb, 11 years ago

Elimination of the 'this.doc' field from the Action base class and the subclasses that rely on it. For Greenstone3 purposes it is unsafe to create this object in the constructor of the action and then store it for other methods to access. This is because Greenstone 3 (and in particular calls to 'process()') operates in a multi-threaded context that is managed by the servlet server (Tomcat by default). Calls to DOM methods are not guaranteed to be thread safe; this became apparent when we started looking into an exception that was being thrown, which centred around use of the DOM method 'item(i)'. The change this commit makes is to stop storing 'this.doc' as a field. A document is now created at the top level of a call to 'process()', and when a DOM reference is needed in a subsequent method, an Element variable (typically passed in as a parameter to the method) is used (through 'Document doc = element.getOwnerDocument()') to gain access to the DOM.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.4 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.apache.xerces.parsers.DOMParser;
34import org.apache.xerces.dom.*; // for new Documents
35
36// other java classes
37import java.io.ByteArrayInputStream;
38import java.io.InputStream;
39import java.io.Reader;
40import java.io.InputStreamReader;
41import java.io.StringReader;
42import java.io.File;
43import java.io.FileInputStream;
44import java.io.FileReader;
45import java.util.regex.*;
46
47import org.apache.log4j.*;
48
49// Apache Commons
50import org.apache.commons.lang3.*;
51
52import java.util.*;
53import java.lang.reflect.*;
54
/**
 * XMLConverter - utility class for Greenstone.
 *
 * Parses XML Strings into Documents and converts Nodes to Strings. Different
 * parsers have different behaviour, so this is the place to experiment with
 * them; at present we only use Xerces.
 */
63public class XMLConverter
64{
65
66 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
67
68 protected EntityResolver resolver = null;
69
70 /** xerces parser */
71 protected DOMParser parser = null;
72
73 private static boolean outputEscaping = true;
74
/**
 * The no-args constructor.
 *
 * Creates the Xerces parser stored in {@code this.parser}, switches off
 * validation, external DTD loading and deferred node expansion, and attaches
 * a ParseErrorHandler. Any exception raised while doing so is logged rather
 * than propagated.
 */
public XMLConverter()
{
    try
    {
        this.parser = new DOMParser();
        this.parser.setFeature("http://xml.org/sax/features/validation", false);
        // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
        this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // a performance test showed that having this on led to increased
        // memory use for small-medium docs, and not much gain for large
        // docs.
        // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
        this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
        // add an error handler to the parser which will store a useful error message on encountering fatal errors, errors and warnings when parsing;
        // this error message can then be converted to xhtml and displayed in a browser.
        this.parser.setErrorHandler(new ParseErrorHandler());
    }
    catch (Exception e)
    {
        // Log the exception itself, not just its message, so the stack trace is kept.
        logger.error(e.getMessage(), e);
    }
}
98
99 /** sets the entity resolver. pass in null to unset it */
100 public void setEntityResolver(EntityResolver er)
101 {
102 this.resolver = er;
103 this.parser.setEntityResolver(er);
104 }
105
106 /**
107 * Given a Node representing an Element or Document, will return the
108 * Element/docroot Element. Returns null if the Node was not an element.
109 */
110 public static Element nodeToElement(Node node)
111 {
112 if (node == null)
113 {
114 return null;
115 }
116 short nodeType = node.getNodeType();
117
118 if (nodeType == Node.DOCUMENT_NODE)
119 {
120 Document docNode = (Document) node;
121 return docNode.getDocumentElement();
122 }
123 else if (nodeType == Node.ELEMENT_NODE)
124 {
125 return (Element) node;
126 }
127 else
128 {
129 String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
130 System.err.println(message);
131 logger.warn(message);
132 return null;
133 }
134 }
135
136 /** returns a DOM Document */
137 public Document getDOM(String in)
138 {
139
140 try
141 {
142 Reader reader = new StringReader(in);
143 InputSource xml_source = new InputSource(reader);
144
145 DOMParser parser = new DOMParser();
146 parser.setFeature("http://xml.org/sax/features/validation", false);
147 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
148 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
149 if (resolver != null)
150 {
151 parser.setEntityResolver(this.resolver);
152 }
153 parser.setErrorHandler(new ParseErrorHandler());
154 parser.parse(xml_source);
155
156 Document doc = parser.getDocument();
157
158 reader.close();
159 return doc;
160
161 }
162 catch (Exception e)
163 {
164 logger.error(e.getMessage());
165 }
166 return null;
167 }
168
169 /** returns a DOM Document */
170 public Document getDOM(String in, String encoding)
171 {
172 try
173 {
174 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
175 InputSource xml_source = new InputSource(reader);
176
177 DOMParser parser = new DOMParser();
178 parser.setFeature("http://xml.org/sax/features/validation", false);
179 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
180 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
181 if (resolver != null)
182 {
183 parser.setEntityResolver(this.resolver);
184 }
185 parser.setErrorHandler(new ParseErrorHandler());
186 parser.parse(xml_source);
187
188 Document doc = parser.getDocument();
189
190 reader.close();
191 return doc;
192
193 }
194 catch (Exception e)
195 {
196 logger.error(e.getMessage());
197 }
198 return null;
199 }
200
201 /** returns a DOM Document */
202 public Document getDOM(File in)
203 {
204 try
205 {
206 FileReader reader = new FileReader(in);
207 InputSource xml_source = new InputSource(reader);
208
209 DOMParser parser = new DOMParser();
210 parser.setFeature("http://xml.org/sax/features/validation", false);
211 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
212 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
213 if (resolver != null)
214 {
215 parser.setEntityResolver(this.resolver);
216 }
217 parser.setErrorHandler(new ParseErrorHandler());
218 parser.parse(xml_source);
219
220 Document doc = parser.getDocument();
221
222 reader.close();
223 return doc;
224
225 }
226 catch (Exception e)
227 {
228 logger.error(e.getMessage(), e);
229
230 }
231 return null;
232 }
233
234 /** returns a DOM document */
235 public Document getDOM(File in, String encoding)
236 {
237 try
238 {
239
240 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
241 InputSource xml_source = new InputSource(isr);
242
243 DOMParser parser = new DOMParser();
244 parser.setFeature("http://xml.org/sax/features/validation", false);
245 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
246 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
247 if (resolver != null)
248 {
249 parser.setEntityResolver(this.resolver);
250 }
251 parser.setErrorHandler(new ParseErrorHandler());
252 parser.parse(xml_source);
253
254 Document doc = parser.getDocument();
255
256 isr.close();
257 return doc;
258
259 }
260 catch (Exception e)
261 {
262 logger.error(e.getMessage());
263 }
264 return null;
265 }
266
267 /** creates a new empty DOM Document */
268 public Document newDOM()
269 {
270 Document doc = new DocumentImpl();
271 return doc;
272 }
273
274 /**
275 * This method's parameters represent the parts of the Doctype of this
276 * Document that is to be created. For more info see
277 * http://xerces.apache.org
278 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
279 * .html#DocumentTypeImpl
280 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
281 *
282 * */
283 public static Document newDOM(String qualifiedName, String publicID, String systemID)
284 {
285 // create empty DOM document
286 DocumentImpl docImpl = new DocumentImpl();
287
288 // Need to use the document to create the docType for it
289 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
290
291 // Although we have created the docType using the document, we need to still
292 // put it into the empty document we just created
293 try
294 {
295 docImpl.appendChild(myDocType);
296 }
297 catch (Exception e)
298 {
299 System.out.println("Could not append docType because: " + e);
300 }
301
302 // return the document containing a DocType
303 return docImpl;
304 }
305
306 /** returns the Node as a String */
307 public static String getString(Node xmlNode)
308 {
309 outputEscaping = true;
310 StringBuffer xmlRepresentation = new StringBuffer();
311 getString(xmlNode, xmlRepresentation, 0, false);
312 return xmlRepresentation.toString();
313 }
314
315 /**
316 * returns the node as a nicely formatted String - this introduces extra
317 * text nodes if the String is read back in as a DOM, so should only be used
318 * for printing
319 */
320 public static String getPrettyString(Node xmlNode)
321 {
322
323 outputEscaping = true;
324 StringBuffer xmlRepresentation = new StringBuffer();
325 getString(xmlNode, xmlRepresentation, 0, true);
326 return xmlRepresentation.toString();
327 }
328
329 /*
330 * For the purposes of logger.debug statements, where this is called and
331 * hence outputted, returns an empty string if debugging is not enabled
332 */
333 public static String getPrettyStringLogger(Node xmlNode, Logger log)
334 {
335
336 if (log.isDebugEnabled())
337 return getPrettyString(xmlNode);
338
339 return "";
340
341 }
342
343 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
344 {
345
346 if (xmlNode == null)
347 {
348 xmlRepresentation.append("<null>");
349 return;
350 }
351
352 short nodeType = xmlNode.getNodeType();
353 String nodeName = xmlNode.getNodeName();
354
355 if (nodeType == Node.DOCUMENT_NODE)
356 {
357 Document xmlDocNode = (Document) xmlNode;
358
359 //if (xmlDocNode.getDoctype() == null) {
360 //System.err.println("Doctype is null.");
361 //}
362 //else {
363 if (xmlDocNode.getDoctype() != null)
364 {
365 DocumentType dt = xmlDocNode.getDoctype();
366
367 String name = dt.getName();
368 String pid = dt.getPublicId();
369 String sid = dt.getSystemId();
370
371 // Use previously assigned name, not dt.getName() again
372 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
373
374 xmlRepresentation.append(doctype_str);
375 }
376 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
377 return;
378 }
379 // Handle Element nodes
380 if (nodeType == Node.ELEMENT_NODE)
381 {
382 if (pretty)
383 {
384 xmlRepresentation.append("\n");
385 for (int i = 0; i < depth; i++)
386 {
387 xmlRepresentation.append(" ");
388 }
389 }
390
391 // Write opening tag
392 xmlRepresentation.append("<");
393 xmlRepresentation.append(nodeName);
394
395 // Write the node attributes
396 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
397 for (int i = 0; i < nodeAttributes.getLength(); i++)
398 {
399 Node attribute = nodeAttributes.item(i);
400 xmlRepresentation.append(" ");
401 xmlRepresentation.append(attribute.getNodeName());
402 xmlRepresentation.append("=\"");
403 xmlRepresentation.append(attribute.getNodeValue());
404 xmlRepresentation.append("\"");
405 }
406
407 // If the node has no children, close the opening tag and return
408 if (xmlNode.hasChildNodes() == false)
409 {
410 // This produces somewhat ugly output, but it is necessary to compensate
411 // for display bugs in Netscape. Firstly, the space is needed before the
412 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
413 // example). Also, a newline character would be expected after the tag,
414 // but this causes problems with the display of links (the link text
415 // will contain a newline character, which is displayed badly).
416 xmlRepresentation.append(" />");
417 return;
418 }
419
420 // Close the opening tag
421 xmlRepresentation.append(">");
422
423 // Apply recursively to the children of this node
424 // hack for nodes next to text nodes - dont make them pretty
425 // this is needed for text inside a <pre> element - any new lines
426 // or spaces around the span elements show up in the text
427 NodeList children = xmlNode.getChildNodes();
428 boolean do_pretty = pretty;
429 for (int i = 0; i < children.getLength(); i++)
430 {
431 if (children.item(i).getNodeType() == Node.TEXT_NODE)
432 {
433 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
434 }
435 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
436 }
437
438 // Write closing tag
439 if (pretty)
440 {
441 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
442 {
443 for (int i = 0; i < depth; i++)
444 xmlRepresentation.append(" ");
445 }
446 }
447 xmlRepresentation.append("</");
448 xmlRepresentation.append(nodeName);
449 xmlRepresentation.append(">");
450 if (pretty)
451 {
452 xmlRepresentation.append("\n");
453 }
454 }
455
456 // Handle Text nodes
457 else if (nodeType == Node.TEXT_NODE)
458 {
459 String text = xmlNode.getNodeValue();
460
461 // Perform output escaping, if required
462 // Apache Commons replace method is far superior to String.replaceAll - very fast!
463 if (outputEscaping)
464 {
465
466 text = StringUtils.replace(text, "&", "&amp;");
467 text = StringUtils.replace(text, "<", "&lt;");
468 text = StringUtils.replace(text, ">", "&gt;");
469 text = StringUtils.replace(text, "'", "&apos;");
470 text = StringUtils.replace(text, "\"", "&quot;");
471 }
472
473 // Remove any control-C characters
474 text = StringUtils.replace(text, "" + (char) 3, "");
475
476 xmlRepresentation.append(text);
477 }
478
479 // Handle Processing Instruction nodes
480 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
481 {
482 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
483 {
484 outputEscaping = false;
485 }
486 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
487 {
488 outputEscaping = true;
489 }
490 else
491 {
492 logger.warn("Unhandled processing instruction " + nodeName);
493 }
494 }
495
496 else if (nodeType == Node.COMMENT_NODE)
497 {
498 String text = xmlNode.getNodeValue();
499 xmlRepresentation.append("<!-- ");
500 xmlRepresentation.append(text);
501 xmlRepresentation.append(" -->");
502 }
503
504 // A type of node that is not handled yet
505 else
506 {
507 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
508 }
509
510 return;
511 }
512
513 protected static String getNodeTypeString(short node_type)
514 {
515
516 String type = "";
517 switch (node_type)
518 {
519 case Node.ATTRIBUTE_NODE:
520 type = "ATTRIBUTE_NODE";
521 break;
522 case Node.CDATA_SECTION_NODE:
523 type = "CDATA_SECTION_NODE";
524 break;
525 case Node.COMMENT_NODE:
526 type = "COMMENT_NODE";
527 break;
528 case Node.DOCUMENT_FRAGMENT_NODE:
529 type = "DOCUMENT_FRAGMENT_NODE";
530 break;
531 case Node.DOCUMENT_NODE:
532 type = "DOCUMENT_NODE";
533 break;
534 case Node.DOCUMENT_TYPE_NODE:
535 type = "DOCUMENT_TYPE_NODE";
536 break;
537 case Node.ELEMENT_NODE:
538 type = "ELEMENT_NODE";
539 break;
540 case Node.ENTITY_NODE:
541 type = "ENTITY_NODE";
542 break;
543 case Node.ENTITY_REFERENCE_NODE:
544 type = "ENTITY_REFERENCE_NODE";
545 break;
546 case Node.NOTATION_NODE:
547 type = "NOTATION_NODE";
548 break;
549 case Node.PROCESSING_INSTRUCTION_NODE:
550 type = "PROCESSING_INSTRUCTION_NODE";
551 break;
552 case Node.TEXT_NODE:
553 type = "TEXT_NODE";
554 break;
555 default:
556 type = "UNKNOWN";
557 }
558
559 return type;
560 }
561
562 // returns null if there no error occurred during parsing, or else returns the error message
563 public String getParseErrorMessage()
564 {
565 ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
566 return errorHandler.getErrorMessage();
567 }
568
569 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
570 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
571 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
572 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
573 static public class ParseErrorHandler implements ErrorHandler
574 {
575 protected String errorMessage = null;
576
577 // Receive notification of a recoverable error.
578 public void error(SAXParseException exception)
579 {
580 handleError("Error:\n", exception);
581 }
582
583 // Receive notification of a non-recoverable error.
584 public void fatalError(SAXParseException exception)
585 {
586 handleError("Fatal Error:\n", exception);
587 }
588
589 // Receive notification of a warning.
590 public void warning(SAXParseException exception)
591 {
592 handleError("Warning:\n", exception);
593 }
594
595 public String toString(SAXParseException e)
596 {
597 String msg = e.getMessage();
598 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
599 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
600 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
601
602 return msg;
603 }
604
605 // clears the errorPage variable after first call to this method
606 public String getErrorMessage()
607 {
608 String errMsg = this.errorMessage;
609 if (this.errorMessage != null)
610 {
611 this.errorMessage = null;
612 }
613 return errMsg;
614 }
615
616 // sets the errorMessage member variable to the data stored in the exception
617 // and writes the errorMessage to the logger and tomcat's System.err
618 protected void handleError(String errorType, SAXParseException exception)
619 {
620 this.errorMessage = errorType + toString(exception);
621 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
622 logger.error(this.errorMessage);
623 }
624 }
625}
Note: See TracBrowser for help on using the repository browser.