source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28962

Last change on this file since 28962 was 28962, checked in by kjdon, 10 years ago

getDOM was already creating a new parser each time, so we don't need one in the constructor. Now that we are basically doing nothing in the constructor, I have made all the methods static, and newDOM is always called statically. getDOM and the others may now also be called statically. I might remove all uses of this.converter and just call the class statically, but haven't done that yet. I have consolidated the getDOM methods for maximum code reuse. Only the file + encoding version was actually using an EntityResolver in the code, so that is the only one I have added it to as a parameter. In future we may need other getDOM methods with an EntityResolver parameter. nodeToElement has been moved to GSXML as it doesn't need to be here.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.xml.sax.SAXNotRecognizedException;
34import org.xml.sax.SAXNotSupportedException;
35import org.apache.xerces.parsers.DOMParser;
36import org.apache.xerces.dom.DocumentImpl; // for new Documents
37import org.apache.xerces.dom.DocumentTypeImpl;
38
39// other java classes
40import java.io.BufferedWriter;
41import java.io.ByteArrayInputStream;
42import java.io.FileWriter;
43import java.io.InputStream;
44import java.io.Reader;
45import java.io.InputStreamReader;
46import java.io.StringReader;
47import java.io.File;
48import java.io.FileInputStream;
49import java.io.FileReader;
50import java.util.regex.*;
51
52import org.apache.log4j.*;
53
54// Apache Commons
55import org.apache.commons.lang3.*;
56
57import java.util.*;
58import java.lang.reflect.*;
59
60/**
61 * XMLConverter - utility class for greenstone
62 *
63 * generates new Documents
64 * parses XML Strings into Documents, converts Nodes to Strings
65 * different parsers have different behaviour - can experiment in here
66 * at the moment we only use xerces
67 * all xerces specific code is in here
68 */
69public class XMLConverter
70{
71
72 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74 //protected EntityResolver resolver = null;
75
76 private static boolean outputEscaping = true;
77
78 /** the no-args constructor */
79 public XMLConverter()
80 {
81
82 }
83
84 // /** sets the entity resolver. pass in null to unset it */
85 // public void setEntityResolver(EntityResolver er)
86 // {
87 // this.resolver = er;
88 // }
89
90 /** returns a DOM Document */
91 public static Document getDOM(String in)
92 {
93
94 try
95 {
96 Reader reader = new StringReader(in);
97 InputSource xml_source = new InputSource(reader);
98 Document doc = getDOM(xml_source, null);
99 reader.close();
100 return doc;
101
102 }
103 catch (Exception e)
104 {
105 logger.error(e.getMessage());
106 }
107 return null;
108 }
109
110 /** returns a DOM Document */
111 public static Document getDOM(String in, String encoding)
112 {
113 try
114 {
115 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
116 InputSource xml_source = new InputSource(reader);
117 Document doc = getDOM(xml_source, null);
118 reader.close();
119 return doc;
120
121 }
122 catch (Exception e)
123 {
124 logger.error(e.getMessage());
125 }
126 return null;
127 }
128
129 /** returns a DOM Document */
130 public static Document getDOM(File in) {
131 try
132 {
133 FileReader reader = new FileReader(in);
134 InputSource xml_source = new InputSource(reader);
135 Document doc = getDOM(xml_source, null);
136 reader.close();
137 return doc;
138
139 }
140 catch (Exception e)
141 {
142 logger.error(e.getMessage(), e);
143
144 }
145 return null;
146 }
147
148 public static Document getDOM(File in, String encoding) {
149 return getDOM(in, encoding, null);
150 }
151
152 /** returns a DOM document */
153 public static Document getDOM(File in, String encoding, EntityResolver er) {
154
155 try {
156
157
158 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
159 InputSource xml_source = new InputSource(isr);
160 Document doc = getDOM(xml_source, er);
161 isr.close();
162 return doc;
163
164 }
165 catch (Exception e)
166 {
167 logger.error(e.getMessage());
168 }
169 return null;
170 }
171
172 public static Document getDOM(InputSource source, EntityResolver er) {
173
174 try {
175 DOMParser parser = new DOMParser();
176 parser.setFeature("http://xml.org/sax/features/validation", false);
177 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
178 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
179 // a performance test showed that having this on lead to increased
180 // memory use for small-medium docs, and not much gain for large
181 // docs.
182 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
183 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
184 // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
185 parser.setErrorHandler(new ParseErrorHandler());
186 if (er != null) {
187 parser.setEntityResolver(er);
188 }
189 parser.parse(source);
190
191 Document doc = parser.getDocument();
192 return doc;
193
194 } catch (Exception e) {
195
196 logger.error(e.getMessage());
197 }
198 return null;
199
200 }
201
202 /** creates a new empty DOM Document */
203 public static Document newDOM()
204 {
205 Document doc = new DocumentImpl();
206 return doc;
207 }
208
209 /**
210 * This method's parameters represent the parts of the Doctype of this
211 * Document that is to be created. For more info see
212 * http://xerces.apache.org
213 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
214 * .html#DocumentTypeImpl
215 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
216 *
217 * */
218 public static Document newDOM(String qualifiedName, String publicID, String systemID)
219 {
220 // create empty DOM document
221 DocumentImpl docImpl = new DocumentImpl();
222
223 // Need to use the document to create the docType for it
224 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
225
226 // Although we have created the docType using the document, we need to still
227 // put it into the empty document we just created
228 try
229 {
230 docImpl.appendChild(myDocType);
231 }
232 catch (Exception e)
233 {
234 System.out.println("Could not append docType because: " + e);
235 }
236
237 // return the document containing a DocType
238 return docImpl;
239 }
240
241 /** returns the Node as a String */
242 public static String getString(Node xmlNode)
243 {
244 outputEscaping = true;
245 StringBuffer xmlRepresentation = new StringBuffer();
246 getString(xmlNode, xmlRepresentation, 0, false);
247 return xmlRepresentation.toString();
248 }
249
250 /**
251 * returns the node as a nicely formatted String - this introduces extra
252 * text nodes if the String is read back in as a DOM, so should only be used
253 * for printing
254 */
255 public static String getPrettyString(Node xmlNode)
256 {
257
258 outputEscaping = true;
259 StringBuffer xmlRepresentation = new StringBuffer();
260 getString(xmlNode, xmlRepresentation, 0, true);
261 return xmlRepresentation.toString();
262 }
263
264 /*
265 * For the purposes of logger.debug statements, where this is called and
266 * hence outputted, returns an empty string if debugging is not enabled
267 */
268 public static String getPrettyStringLogger(Node xmlNode, Logger log)
269 {
270
271 if (log.isDebugEnabled())
272 return getPrettyString(xmlNode);
273
274 return "";
275
276 }
277
278 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
279 {
280
281 if (xmlNode == null)
282 {
283 xmlRepresentation.append("<null>");
284 return;
285 }
286
287 short nodeType = xmlNode.getNodeType();
288 String nodeName = xmlNode.getNodeName();
289
290 if (nodeType == Node.DOCUMENT_NODE)
291 {
292 Document xmlDocNode = (Document) xmlNode;
293
294 //if (xmlDocNode.getDoctype() == null) {
295 //System.err.println("Doctype is null.");
296 //}
297 //else {
298 if (xmlDocNode.getDoctype() != null)
299 {
300 DocumentType dt = xmlDocNode.getDoctype();
301
302 String name = dt.getName();
303 String pid = dt.getPublicId();
304 String sid = dt.getSystemId();
305
306 // Use previously assigned name, not dt.getName() again
307 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
308
309 xmlRepresentation.append(doctype_str);
310 }
311 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
312 return;
313 }
314 // Handle Element nodes
315 if (nodeType == Node.ELEMENT_NODE)
316 {
317 if (pretty)
318 {
319 xmlRepresentation.append("\n");
320 for (int i = 0; i < depth; i++)
321 {
322 xmlRepresentation.append(" ");
323 }
324 }
325
326 // Write opening tag
327 xmlRepresentation.append("<");
328 xmlRepresentation.append(nodeName);
329
330 // Write the node attributes
331 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
332 for (int i = 0; i < nodeAttributes.getLength(); i++)
333 {
334 Node attribute = nodeAttributes.item(i);
335 xmlRepresentation.append(" ");
336 xmlRepresentation.append(attribute.getNodeName());
337 xmlRepresentation.append("=\"");
338 xmlRepresentation.append(attribute.getNodeValue());
339 xmlRepresentation.append("\"");
340 }
341
342 // If the node has no children, close the opening tag and return
343 if (xmlNode.hasChildNodes() == false)
344 {
345 // This produces somewhat ugly output, but it is necessary to compensate
346 // for display bugs in Netscape. Firstly, the space is needed before the
347 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
348 // example). Also, a newline character would be expected after the tag,
349 // but this causes problems with the display of links (the link text
350 // will contain a newline character, which is displayed badly).
351 xmlRepresentation.append(" />");
352 return;
353 }
354
355 // Close the opening tag
356 xmlRepresentation.append(">");
357
358 // Apply recursively to the children of this node
359 // hack for nodes next to text nodes - dont make them pretty
360 // this is needed for text inside a <pre> element - any new lines
361 // or spaces around the span elements show up in the text
362 NodeList children = xmlNode.getChildNodes();
363 boolean do_pretty = pretty;
364 for (int i = 0; i < children.getLength(); i++)
365 {
366 if (children.item(i).getNodeType() == Node.TEXT_NODE)
367 {
368 do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
369 }
370 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
371 }
372
373 // Write closing tag
374 if (pretty)
375 {
376 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
377 {
378 for (int i = 0; i < depth; i++)
379 xmlRepresentation.append(" ");
380 }
381 }
382 xmlRepresentation.append("</");
383 xmlRepresentation.append(nodeName);
384 xmlRepresentation.append(">");
385 if (pretty)
386 {
387 xmlRepresentation.append("\n");
388 }
389 }
390
391 // Handle Text nodes
392 else if (nodeType == Node.TEXT_NODE)
393 {
394 String text = xmlNode.getNodeValue();
395
396 // Perform output escaping, if required
397 // Apache Commons replace method is far superior to String.replaceAll - very fast!
398 if (outputEscaping)
399 {
400
401 text = StringUtils.replace(text, "&", "&amp;");
402 text = StringUtils.replace(text, "<", "&lt;");
403 text = StringUtils.replace(text, ">", "&gt;");
404 text = StringUtils.replace(text, "'", "&apos;");
405 text = StringUtils.replace(text, "\"", "&quot;");
406 }
407
408 // Remove any control-C characters
409 text = StringUtils.replace(text, "" + (char) 3, "");
410
411 xmlRepresentation.append(text);
412 }
413
414 // Handle Processing Instruction nodes
415 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
416 {
417 if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
418 {
419 outputEscaping = false;
420 }
421 else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
422 {
423 outputEscaping = true;
424 }
425 else
426 {
427 logger.warn("Unhandled processing instruction " + nodeName);
428 }
429 }
430
431 else if (nodeType == Node.COMMENT_NODE)
432 {
433 String text = xmlNode.getNodeValue();
434 xmlRepresentation.append("<!-- ");
435 xmlRepresentation.append(text);
436 xmlRepresentation.append(" -->");
437 }
438
439 // A type of node that is not handled yet
440 else
441 {
442 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
443 }
444
445 return;
446 }
447
448 protected static String getNodeTypeString(short node_type)
449 {
450
451 String type = "";
452 switch (node_type)
453 {
454 case Node.ATTRIBUTE_NODE:
455 type = "ATTRIBUTE_NODE";
456 break;
457 case Node.CDATA_SECTION_NODE:
458 type = "CDATA_SECTION_NODE";
459 break;
460 case Node.COMMENT_NODE:
461 type = "COMMENT_NODE";
462 break;
463 case Node.DOCUMENT_FRAGMENT_NODE:
464 type = "DOCUMENT_FRAGMENT_NODE";
465 break;
466 case Node.DOCUMENT_NODE:
467 type = "DOCUMENT_NODE";
468 break;
469 case Node.DOCUMENT_TYPE_NODE:
470 type = "DOCUMENT_TYPE_NODE";
471 break;
472 case Node.ELEMENT_NODE:
473 type = "ELEMENT_NODE";
474 break;
475 case Node.ENTITY_NODE:
476 type = "ENTITY_NODE";
477 break;
478 case Node.ENTITY_REFERENCE_NODE:
479 type = "ENTITY_REFERENCE_NODE";
480 break;
481 case Node.NOTATION_NODE:
482 type = "NOTATION_NODE";
483 break;
484 case Node.PROCESSING_INSTRUCTION_NODE:
485 type = "PROCESSING_INSTRUCTION_NODE";
486 break;
487 case Node.TEXT_NODE:
488 type = "TEXT_NODE";
489 break;
490 default:
491 type = "UNKNOWN";
492 }
493
494 return type;
495 }
496
497 // returns null if there no error occurred during parsing, or else returns the error message
498
499 // public String getParseErrorMessage()
500 // {
501 // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
502 // return errorHandler.getErrorMessage();
503 // }
504
505 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
506 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
507 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
508 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
509 static public class ParseErrorHandler implements ErrorHandler
510 {
511 protected String errorMessage = null;
512
513 // Receive notification of a recoverable error.
514 public void error(SAXParseException exception)
515 {
516 handleError("Error:\n", exception);
517 }
518
519 // Receive notification of a non-recoverable error.
520 public void fatalError(SAXParseException exception)
521 {
522 handleError("Fatal Error:\n", exception);
523 }
524
525 // Receive notification of a warning.
526 public void warning(SAXParseException exception)
527 {
528 handleError("Warning:\n", exception);
529 }
530
531 public String toString(SAXParseException e)
532 {
533 String msg = e.getMessage();
534 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
535 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
536 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
537
538 return msg;
539 }
540
541 // clears the errorPage variable after first call to this method
542 public String getErrorMessage()
543 {
544 String errMsg = this.errorMessage;
545 if (this.errorMessage != null)
546 {
547 this.errorMessage = null;
548 }
549 return errMsg;
550 }
551
552 // sets the errorMessage member variable to the data stored in the exception
553 // and writes the errorMessage to the logger and tomcat's System.err
554 protected void handleError(String errorType, SAXParseException exception)
555 {
556 this.errorMessage = errorType + toString(exception);
557 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
558 logger.error(this.errorMessage);
559 }
560 }
561
562 public static boolean writeDOM(Element elem, File file) {
563
564 BufferedWriter writer = null;
565 boolean success = false;
566 try {
567 String xml_string = getString(elem);
568 // need createNewFile???
569 writer = new BufferedWriter(new FileWriter(file));
570 writer.write(xml_string);
571 success = true;
572 }
573
574 catch (Exception e) {
575 logger.error(e.getMessage());
576 success = false;
577 }
578 finally {
579 try {
580 if (writer != null) {
581 writer.close();
582 }
583 } catch(Exception e) {
584 logger.error("couldn't close the file"+e.getMessage());
585 }
586 }
587 return success;
588 }
589}
Note: See TracBrowser for help on using the repository browser.