source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28978

Last change on this file since 28978 was 28978, checked in by kjdon, 10 years ago

removed commented out code. removed static variable outputEscaping. what will happen if two threads are doing pretty printing???. Now we process text and processing instruction nodes in a line, not recursively, so can use the processing instructions when processing the text nodes in the for loop.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.xml.sax.SAXNotRecognizedException;
34import org.xml.sax.SAXNotSupportedException;
35import org.apache.xerces.parsers.DOMParser;
36import org.apache.xerces.dom.DocumentImpl; // for new Documents
37import org.apache.xerces.dom.DocumentTypeImpl;
38
39// other java classes
40import java.io.BufferedWriter;
41import java.io.ByteArrayInputStream;
42import java.io.FileWriter;
43import java.io.InputStream;
44import java.io.Reader;
45import java.io.InputStreamReader;
46import java.io.StringReader;
47import java.io.File;
48import java.io.FileInputStream;
49import java.io.FileReader;
50import java.util.regex.*;
51
52import org.apache.log4j.*;
53
54// Apache Commons
55import org.apache.commons.lang3.*;
56
57import java.util.*;
58import java.lang.reflect.*;
59
60/**
61 * XMLConverter - utility class for greenstone
62 *
63 * generates new Documents
64 * parses XML Strings into Documents, converts Nodes to Strings
65 * different parsers have different behaviour - can experiment in here
66 * at the moment we only use xerces
67 * all xerces specific code is in here
68 */
69public class XMLConverter
70{
71
72 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74 /** the no-args constructor */
75 public XMLConverter()
76 {
77
78 }
79
80 /** returns a DOM Document */
81 public static Document getDOM(String in)
82 {
83
84 try
85 {
86 Reader reader = new StringReader(in);
87 InputSource xml_source = new InputSource(reader);
88 Document doc = getDOM(xml_source, null);
89 reader.close();
90 return doc;
91
92 }
93 catch (Exception e)
94 {
95 logger.error(e.getMessage());
96 }
97 return null;
98 }
99
100 /** returns a DOM Document */
101 public static Document getDOM(String in, String encoding)
102 {
103 try
104 {
105 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
106 InputSource xml_source = new InputSource(reader);
107 Document doc = getDOM(xml_source, null);
108 reader.close();
109 return doc;
110
111 }
112 catch (Exception e)
113 {
114 logger.error(e.getMessage());
115 }
116 return null;
117 }
118
119 /** returns a DOM Document */
120 public static Document getDOM(File in) {
121 try
122 {
123 FileReader reader = new FileReader(in);
124 InputSource xml_source = new InputSource(reader);
125 Document doc = getDOM(xml_source, null);
126 reader.close();
127 return doc;
128
129 }
130 catch (Exception e)
131 {
132 logger.error(e.getMessage(), e);
133
134 }
135 return null;
136 }
137
138 public static Document getDOM(File in, String encoding) {
139 return getDOM(in, encoding, null);
140 }
141
142 /** returns a DOM document */
143 public static Document getDOM(File in, String encoding, EntityResolver er) {
144
145 try {
146
147
148 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
149 InputSource xml_source = new InputSource(isr);
150 Document doc = getDOM(xml_source, er);
151 isr.close();
152 return doc;
153
154 }
155 catch (Exception e)
156 {
157 logger.error(e.getMessage());
158 }
159 return null;
160 }
161
162 public static Document getDOM(InputSource source, EntityResolver er) {
163
164 try {
165 DOMParser parser = new DOMParser();
166 parser.setFeature("http://xml.org/sax/features/validation", false);
167 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
168 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
169 // a performance test showed that having this on lead to increased
170 // memory use for small-medium docs, and not much gain for large
171 // docs.
172 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
173 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
174 // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
175 parser.setErrorHandler(new ParseErrorHandler());
176 if (er != null) {
177 parser.setEntityResolver(er);
178 }
179 parser.parse(source);
180
181 Document doc = parser.getDocument();
182 return doc;
183
184 } catch (Exception e) {
185
186 logger.error(e.getMessage());
187 }
188 return null;
189
190 }
191
192 /** creates a new empty DOM Document */
193 public static Document newDOM()
194 {
195 Document doc = new DocumentImpl();
196 return doc;
197 }
198
199 /**
200 * This method's parameters represent the parts of the Doctype of this
201 * Document that is to be created. For more info see
202 * http://xerces.apache.org
203 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
204 * .html#DocumentTypeImpl
205 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
206 *
207 * */
208 public static Document newDOM(String qualifiedName, String publicID, String systemID)
209 {
210 // create empty DOM document
211 DocumentImpl docImpl = new DocumentImpl();
212
213 // Need to use the document to create the docType for it
214 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
215
216 // Although we have created the docType using the document, we need to still
217 // put it into the empty document we just created
218 try
219 {
220 docImpl.appendChild(myDocType);
221 }
222 catch (Exception e)
223 {
224 System.out.println("Could not append docType because: " + e);
225 }
226
227 // return the document containing a DocType
228 return docImpl;
229 }
230
231 /** returns the Node as a String */
232 public static String getString(Node xmlNode)
233 {
234 StringBuffer xmlRepresentation = new StringBuffer();
235 getString(xmlNode, xmlRepresentation, 0, false);
236 return xmlRepresentation.toString();
237 }
238
239 /**
240 * returns the node as a nicely formatted String - this introduces extra
241 * text nodes if the String is read back in as a DOM, so should only be used
242 * for printing
243 */
244 public static String getPrettyString(Node xmlNode)
245 {
246 StringBuffer xmlRepresentation = new StringBuffer();
247 getString(xmlNode, xmlRepresentation, 0, true);
248 return xmlRepresentation.toString();
249 }
250
251 /*
252 * For the purposes of logger.debug statements, where this is called and
253 * hence outputted, returns an empty string if debugging is not enabled
254 */
255 public static String getPrettyStringLogger(Node xmlNode, Logger log)
256 {
257
258 if (log.isDebugEnabled())
259 return getPrettyString(xmlNode);
260
261 return "";
262
263 }
264
265 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
266 {
267
268 if (xmlNode == null)
269 {
270 xmlRepresentation.append("<null>");
271 return;
272 }
273
274 short nodeType = xmlNode.getNodeType();
275 String nodeName = xmlNode.getNodeName();
276
277 if (nodeType == Node.DOCUMENT_NODE)
278 {
279 Document xmlDocNode = (Document) xmlNode;
280
281 //if (xmlDocNode.getDoctype() == null) {
282 //System.err.println("Doctype is null.");
283 //}
284 //else {
285 if (xmlDocNode.getDoctype() != null)
286 {
287 DocumentType dt = xmlDocNode.getDoctype();
288
289 String name = dt.getName();
290 String pid = dt.getPublicId();
291 String sid = dt.getSystemId();
292
293 // Use previously assigned name, not dt.getName() again
294 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
295
296 xmlRepresentation.append(doctype_str);
297 }
298 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
299 return;
300 }
301 // Handle Element nodes
302 if (nodeType == Node.ELEMENT_NODE)
303 {
304 if (pretty)
305 {
306 xmlRepresentation.append("\n");
307 for (int i = 0; i < depth; i++)
308 {
309 xmlRepresentation.append(" ");
310 }
311 }
312
313 // Write opening tag
314 xmlRepresentation.append("<");
315 xmlRepresentation.append(nodeName);
316
317 // Write the node attributes
318 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
319 for (int i = 0; i < nodeAttributes.getLength(); i++)
320 {
321 Node attribute = nodeAttributes.item(i);
322 xmlRepresentation.append(" ");
323 xmlRepresentation.append(attribute.getNodeName());
324 xmlRepresentation.append("=\"");
325 xmlRepresentation.append(attribute.getNodeValue());
326 xmlRepresentation.append("\"");
327 }
328
329 // If the node has no children, close the opening tag and return
330 if (xmlNode.hasChildNodes() == false)
331 {
332 // This produces somewhat ugly output, but it is necessary to compensate
333 // for display bugs in Netscape. Firstly, the space is needed before the
334 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
335 // example). Also, a newline character would be expected after the tag,
336 // but this causes problems with the display of links (the link text
337 // will contain a newline character, which is displayed badly).
338 xmlRepresentation.append(" />");
339 return;
340 }
341
342 // Close the opening tag
343 xmlRepresentation.append(">");
344
345 // Process the children. We process text nodes here, but recursively process other nodes.
346 // hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
347 // Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
348 // also if these are inside a pre tag then the space shows up in the page.
349
350 NodeList children = xmlNode.getChildNodes();
351 boolean do_pretty = pretty;
352 boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
353 for (int i = 0; i < children.getLength(); i++)
354 {
355 Node child = children.item(i);
356 short child_type = child.getNodeType();
357 if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
358 if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
359 output_escaping = false;
360 }
361 else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
362 output_escaping = true;
363 }
364 else {
365 logger.warn("Unhandled processing instruction " + child.getNodeName());
366 }
367 }
368 else if (child_type == Node.TEXT_NODE) {
369 do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
370 // output the text
371 String text = child.getNodeValue();
372
373 // Perform output escaping, if required
374 // Apache Commons replace method is far superior to String.replaceAll - very fast!
375 if (output_escaping) {
376 text = StringUtils.replace(text, "&", "&amp;");
377 text = StringUtils.replace(text, "<", "&lt;");
378 text = StringUtils.replace(text, ">", "&gt;");
379 text = StringUtils.replace(text, "'", "&apos;");
380 text = StringUtils.replace(text, "\"", "&quot;");
381 }
382 // Remove any control-C characters
383 text = StringUtils.replace(text, "" + (char) 3, "");
384
385 xmlRepresentation.append(text);
386
387 }
388 else {
389 // recursively call getString
390 getString(child, xmlRepresentation, depth + 1, do_pretty);
391 }
392 } // foreach child of the element
393
394 // Write closing tag
395 if (pretty)
396 {
397 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
398 {
399 for (int i = 0; i < depth; i++)
400 xmlRepresentation.append(" ");
401 }
402 }
403 xmlRepresentation.append("</");
404 xmlRepresentation.append(nodeName);
405 xmlRepresentation.append(">");
406 if (pretty)
407 {
408 xmlRepresentation.append("\n");
409 }
410 } // ELEMENT_NODE
411
412 else if (nodeType == Node.COMMENT_NODE)
413 {
414 String text = xmlNode.getNodeValue();
415 xmlRepresentation.append("<!-- ");
416 xmlRepresentation.append(text);
417 xmlRepresentation.append(" -->");
418 }
419
420 // TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
421 // A type of node that is not handled yet
422 else
423 {
424 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
425 }
426
427 return;
428 }
429
430 protected static String getNodeTypeString(short node_type)
431 {
432
433 String type = "";
434 switch (node_type)
435 {
436 case Node.ATTRIBUTE_NODE:
437 type = "ATTRIBUTE_NODE";
438 break;
439 case Node.CDATA_SECTION_NODE:
440 type = "CDATA_SECTION_NODE";
441 break;
442 case Node.COMMENT_NODE:
443 type = "COMMENT_NODE";
444 break;
445 case Node.DOCUMENT_FRAGMENT_NODE:
446 type = "DOCUMENT_FRAGMENT_NODE";
447 break;
448 case Node.DOCUMENT_NODE:
449 type = "DOCUMENT_NODE";
450 break;
451 case Node.DOCUMENT_TYPE_NODE:
452 type = "DOCUMENT_TYPE_NODE";
453 break;
454 case Node.ELEMENT_NODE:
455 type = "ELEMENT_NODE";
456 break;
457 case Node.ENTITY_NODE:
458 type = "ENTITY_NODE";
459 break;
460 case Node.ENTITY_REFERENCE_NODE:
461 type = "ENTITY_REFERENCE_NODE";
462 break;
463 case Node.NOTATION_NODE:
464 type = "NOTATION_NODE";
465 break;
466 case Node.PROCESSING_INSTRUCTION_NODE:
467 type = "PROCESSING_INSTRUCTION_NODE";
468 break;
469 case Node.TEXT_NODE:
470 type = "TEXT_NODE";
471 break;
472 default:
473 type = "UNKNOWN";
474 }
475
476 return type;
477 }
478
479 // returns null if there no error occurred during parsing, or else returns the error message
480
481 // public String getParseErrorMessage()
482 // {
483 // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
484 // return errorHandler.getErrorMessage();
485 // }
486
487 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
488 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
489 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
490 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
491 static public class ParseErrorHandler implements ErrorHandler
492 {
493 protected String errorMessage = null;
494
495 // Receive notification of a recoverable error.
496 public void error(SAXParseException exception)
497 {
498 handleError("Error:\n", exception);
499 }
500
501 // Receive notification of a non-recoverable error.
502 public void fatalError(SAXParseException exception)
503 {
504 handleError("Fatal Error:\n", exception);
505 }
506
507 // Receive notification of a warning.
508 public void warning(SAXParseException exception)
509 {
510 handleError("Warning:\n", exception);
511 }
512
513 public String toString(SAXParseException e)
514 {
515 String msg = e.getMessage();
516 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
517 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
518 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
519
520 return msg;
521 }
522
523 // clears the errorPage variable after first call to this method
524 public String getErrorMessage()
525 {
526 String errMsg = this.errorMessage;
527 if (this.errorMessage != null)
528 {
529 this.errorMessage = null;
530 }
531 return errMsg;
532 }
533
534 // sets the errorMessage member variable to the data stored in the exception
535 // and writes the errorMessage to the logger and tomcat's System.err
536 protected void handleError(String errorType, SAXParseException exception)
537 {
538 this.errorMessage = errorType + toString(exception);
539 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
540 logger.error(this.errorMessage);
541 }
542 }
543
544 public static boolean writeDOM(Element elem, File file) {
545
546 BufferedWriter writer = null;
547 boolean success = false;
548 try {
549 String xml_string = getString(elem);
550 // need createNewFile???
551 writer = new BufferedWriter(new FileWriter(file));
552 writer.write(xml_string);
553 success = true;
554 }
555
556 catch (Exception e) {
557 logger.error(e.getMessage());
558 success = false;
559 }
560 finally {
561 try {
562 if (writer != null) {
563 writer.close();
564 }
565 } catch(Exception e) {
566 logger.error("couldn't close the file"+e.getMessage());
567 }
568 }
569 return success;
570 }
571}
Note: See TracBrowser for help on using the repository browser.