source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 18434

Last change on this file since 18434 was 18434, checked in by max, 15 years ago
  1. Provides a new DOM method that allows a document to be created with a specified DOCTYPE.
  2. Reusable ErrorHandler class for parsing, that will deal with warnings, fatal and regular errors that may occur when parsing XML.
  3. XMLConverter's parse object has an ErrorHandler attached to it when initialised.
  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.apache.xerces.parsers.DOMParser;
34import org.apache.xerces.dom.*; // for new Documents
35
36// other java classes
37import java.io.Reader;
38import java.io.InputStreamReader;
39import java.io.StringReader;
40import java.io.File;
41import java.io.FileInputStream;
42import java.io.FileReader;
43
44import org.apache.log4j.*;
45
46/** XMLConverter - utility class for greenstone
47 *
48 * parses XML Strings into Documents, converts Nodes to Strings
49 * different parsers have different behaviour - can experiment in here
50 * now we only use xerces
51 *
52 */
53public class XMLConverter {
54
55 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57 /** xerces parser */
58 protected DOMParser parser = null;
59
60 private static boolean outputEscaping = true;
61
62
63 /** the no-args constructor */
64 public XMLConverter() {
65 try {
66 this.parser = new DOMParser();
67 this.parser.setFeature("http://xml.org/sax/features/validation", false);
68 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70 // a performance test showed that having this on lead to increased
71 // memory use for small-medium docs, and not much gain for large
72 // docs.
73 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75 // add an errorhandler to the parser which will store useful a error message on encountering fatal errors, errors and warnings when parsing
76 // this errormessage can then be converted to xhtml and displayed in a browser.
77 this.parser.setErrorHandler(new ParseErrorHandler());
78 } catch (Exception e) {
79 logger.error(e.getMessage());
80 }
81 }
82
83 /** sets the entity resolver. pass in null to unset it */
84 public void setEntityResolver(EntityResolver er) {
85 this.parser.setEntityResolver(er);
86 }
87
88 /** Given a Node representing an Element or Document, will return the
89 * Element/docroot Element. Returns null if the Node was not an element. */
90 public static Element nodeToElement(Node node)
91 {
92 if(node == null) {
93 return null;
94 }
95 short nodeType = node.getNodeType();
96
97 if (nodeType == Node.DOCUMENT_NODE) {
98 Document docNode = (Document)node;
99 return docNode.getDocumentElement() ;
100 }
101 else if (nodeType == Node.ELEMENT_NODE) {
102 return (Element)node;
103 }
104 else {
105 String message = "Expecting Document or Element node type but got "
106 + node.getNodeName() + "\nReturning null";
107 System.err.println(message);
108 logger.warn(message);
109 return null;
110 }
111 }
112
113 /** returns a DOM Document */
114 public Document getDOM(String in) {
115
116 try {
117 Reader reader = new StringReader(in);
118 InputSource xml_source = new InputSource(reader);
119
120 this.parser.parse(xml_source);
121 Document doc = this.parser.getDocument();
122
123 return doc;
124
125 } catch (Exception e) {
126 logger.error(e.getMessage());
127 }
128 return null;
129 }
130
131 /** returns a DOM Document */
132 public Document getDOM(File in) {
133 try {
134 FileReader reader = new FileReader(in);
135 InputSource xml_source = new InputSource(reader);
136 this.parser.parse(xml_source);
137 Document doc = this.parser.getDocument();
138 return doc;
139
140 } catch (Exception e) {
141 logger.error(e.getMessage(), e);
142
143 }
144 return null;
145 }
146
147 /** returns a DOM document */
148 public Document getDOM(File in, String encoding) {
149 try {
150
151 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
152 InputSource xml_source = new InputSource(isr);
153
154 this.parser.parse(xml_source);
155 Document doc = this.parser.getDocument();
156
157 return doc;
158
159 } catch (Exception e) {
160 logger.error(e.getMessage());
161 }
162 return null;
163 }
164
165
166 /** creates a new empty DOM Document */
167 public static Document newDOM() {
168 Document doc = new DocumentImpl();
169 return doc;
170 }
171
172 /**
173 * This method's parameters represent the parts of the Doctype of this
174 * Document that is to be created.
175 * For more info see
176 * http://xerces.apache.org/xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl.html#DocumentTypeImpl(org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
177 *
178 * */
179 public static Document newDOM(String qualifiedName, String publicID, String systemID) {
180 // create empty DOM document
181 DocumentImpl docImpl = new DocumentImpl();
182
183 // Need to use the document to create the docType for it
184 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
185
186 // Although we have created the docType using the document, we need to still
187 // put it into the empty document we just created
188 try{
189 docImpl.appendChild(myDocType);
190 } catch(Exception e) {
191 System.out.println("Could not append docType because: " + e) ;
192 }
193
194 // return the document containing a DocType
195 return docImpl;
196 }
197
198 /** returns the Node as a String */
199 public static String getString(Node xmlNode)
200 {
201 outputEscaping = true;
202 StringBuffer xmlRepresentation = new StringBuffer();
203 getString(xmlNode, xmlRepresentation, 0, false);
204 return xmlRepresentation.toString();
205 }
206
207 /** returns the node as a nicely formatted String - this introduces extra
208 * text nodes if the String is read back in as a DOM, so should only be
209 * used for printing */
210 public static String getPrettyString(Node xmlNode) {
211
212 outputEscaping = true;
213 StringBuffer xmlRepresentation = new StringBuffer();
214 getString(xmlNode, xmlRepresentation, 0, true);
215 return xmlRepresentation.toString();
216 }
217
218 private static void getString(Node xmlNode, StringBuffer xmlRepresentation,
219 int depth, boolean pretty)
220 {
221
222 if (xmlNode == null) {
223 xmlRepresentation.append("<null>");
224 return;
225 }
226
227 short nodeType = xmlNode.getNodeType();
228 String nodeName = xmlNode.getNodeName();
229
230 if (nodeType == Node.DOCUMENT_NODE) {
231 Document xmlDocNode = (Document)xmlNode;
232
233 //if (xmlDocNode.getDoctype() == null) {
234 //System.err.println("Doctype is null.");
235 //}
236 //else {
237 if (xmlDocNode.getDoctype() != null) {
238 DocumentType dt = xmlDocNode.getDoctype();
239
240 String name = dt.getName();
241 String pid = dt.getPublicId();
242 String sid = dt.getSystemId();
243
244 String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
245
246 xmlRepresentation.append(doctype_str);
247 }
248 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
249 return;
250 }
251 // Handle Element nodes
252 if (nodeType == Node.ELEMENT_NODE) {
253 if (pretty) {
254 xmlRepresentation.append("\n");
255 for (int i = 0; i < depth; i++) {
256 xmlRepresentation.append(" ");
257 }
258 }
259
260 // Write opening tag
261 xmlRepresentation.append("<");
262 xmlRepresentation.append(nodeName);
263
264 // Write the node attributes
265 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
266 for (int i = 0; i < nodeAttributes.getLength(); i++) {
267 Node attribute = nodeAttributes.item(i);
268 xmlRepresentation.append(" ");
269 xmlRepresentation.append(attribute.getNodeName());
270 xmlRepresentation.append("=\"");
271 xmlRepresentation.append(attribute.getNodeValue());
272 xmlRepresentation.append("\"");
273 }
274
275 // If the node has no children, close the opening tag and return
276 if (xmlNode.hasChildNodes() == false) {
277 // This produces somewhat ugly output, but it is necessary to compensate
278 // for display bugs in Netscape. Firstly, the space is needed before the
279 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
280 // example). Also, a newline character would be expected after the tag,
281 // but this causes problems with the display of links (the link text
282 // will contain a newline character, which is displayed badly).
283 xmlRepresentation.append(" />");
284 return;
285 }
286
287 // Close the opening tag
288 xmlRepresentation.append(">");
289
290 // Apply recursively to the children of this node
291 // hack for nodes next to text nodes - dont make them pretty
292 // this is needed for text inside a <pre> element - any new lines
293 // or spaces around the span elements show up in the text
294 NodeList children = xmlNode.getChildNodes();
295 boolean do_pretty = pretty;
296 for (int i = 0; i < children.getLength(); i++) {
297 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
298 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
299 }
300 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
301 }
302
303 // Write closing tag
304 if (pretty) {
305 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
306 for (int i = 0; i < depth; i++)
307 xmlRepresentation.append(" ");
308 }
309 }
310 xmlRepresentation.append("</");
311 xmlRepresentation.append(nodeName);
312 xmlRepresentation.append(">");
313 if (pretty) {
314 xmlRepresentation.append("\n");
315 }
316 }
317
318 // Handle Text nodes
319 else if (nodeType == Node.TEXT_NODE) {
320 String text = xmlNode.getNodeValue();
321
322 // Perform output escaping, if required
323 if (outputEscaping) {
324 text = text.replaceAll("&", "&amp;"); // Must be done first!!
325 text = text.replaceAll("<", "&lt;");
326 text = text.replaceAll(">", "&gt;");
327 text = text.replaceAll("\"", "&quot;");
328 text = text.replaceAll("\'", "&apos;");
329 }
330
331 // Remove any control-C characters
332 text = text.replaceAll("" + (char) 3, "");
333 xmlRepresentation.append(text);
334 }
335
336 // Handle Processing Instruction nodes
337 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
338 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
339 outputEscaping = false;
340 }
341 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
342 outputEscaping = true;
343 }
344 else {
345 logger.warn("Unhandled processing instruction " + nodeName);
346 }
347 }
348
349 else if (nodeType == Node.COMMENT_NODE) {
350 String text = xmlNode.getNodeValue();
351 xmlRepresentation.append("<!-- ");
352 xmlRepresentation.append(text);
353 xmlRepresentation.append(" -->");
354 }
355
356
357
358 // A type of node that is not handled yet
359 else {
360 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
361 }
362
363 return;
364 }
365
366 protected static String getNodeTypeString(short node_type) {
367
368 String type = "";
369 switch(node_type) {
370 case Node.ATTRIBUTE_NODE:
371 type="ATTRIBUTE_NODE";
372 break;
373 case Node.CDATA_SECTION_NODE:
374 type="CDATA_SECTION_NODE";
375 break;
376 case Node.COMMENT_NODE:
377 type="COMMENT_NODE";
378 break;
379 case Node.DOCUMENT_FRAGMENT_NODE:
380 type="DOCUMENT_FRAGMENT_NODE";
381 break;
382 case Node.DOCUMENT_NODE:
383 type="DOCUMENT_NODE";
384 break;
385 case Node.DOCUMENT_TYPE_NODE:
386 type="DOCUMENT_TYPE_NODE";
387 break;
388 case Node.ELEMENT_NODE:
389 type="ELEMENT_NODE";
390 break;
391 case Node.ENTITY_NODE:
392 type="ENTITY_NODE";
393 break;
394 case Node.ENTITY_REFERENCE_NODE:
395 type="ENTITY_REFERENCE_NODE";
396 break;
397 case Node.NOTATION_NODE:
398 type="NOTATION_NODE";
399 break;
400 case Node.PROCESSING_INSTRUCTION_NODE:
401 type="PROCESSING_INSTRUCTION_NODE";
402 break;
403 case Node.TEXT_NODE:
404 type="TEXT_NODE";
405 break;
406 default:
407 type="UNKNOWN";
408 }
409
410 return type;
411 }
412
413 // returns null if there no error occurred during parsing, or else returns the error message
414 public String getParseErrorMessage() {
415 ParseErrorHandler errorHandler = (ParseErrorHandler)this.parser.getErrorHandler();
416 return errorHandler.getErrorMessage();
417 }
418
419 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
420 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
421 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
422 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
423 static public class ParseErrorHandler implements ErrorHandler {
424 protected String errorMessage = null;
425
426 // Receive notification of a recoverable error.
427 public void error(SAXParseException exception) {
428 handleError("Error:\n", exception);
429 }
430 // Receive notification of a non-recoverable error.
431 public void fatalError(SAXParseException exception) {
432 handleError("Fatal Error:\n", exception);
433 }
434 // Receive notification of a warning.
435 public void warning(SAXParseException exception) {
436 handleError("Warning:\n", exception);
437 }
438
439 public String toString(SAXParseException e) {
440 String msg = e.getMessage();
441 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
442 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
443 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
444
445 return msg;
446 }
447
448 // clears the errorPage variable after first call to this method
449 public String getErrorMessage() {
450 String errMsg = this.errorMessage;
451 if(this.errorMessage != null) {
452 this.errorMessage = null;
453 }
454 return errMsg;
455 }
456
457 // sets the errorMessage member variable to the data stored in the exception
458 // and writes the errorMessage to the logger and tomcat's System.err
459 protected void handleError(String errorType, SAXParseException exception) {
460 this.errorMessage = errorType + toString(exception);
461 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
462 logger.error(this.errorMessage);
463 }
464 }
465}
Note: See TracBrowser for help on using the repository browser.