source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 29728

Last change on this file since 29728 was 29728, checked in by ak19, 9 years ago

Adjusting gs3-server code to work with changes made to 29686 where web.xml was split into web.xml and servlets.xml, with the first including the second.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.DOMImplementation;
23import org.w3c.dom.Document;
24import org.w3c.dom.DocumentType;
25import org.w3c.dom.Element;
26import org.w3c.dom.Node;
27import org.w3c.dom.NodeList;
28import org.w3c.dom.NamedNodeMap;
29import org.xml.sax.InputSource;
30import org.xml.sax.EntityResolver;
31import org.xml.sax.ErrorHandler;
32import org.xml.sax.SAXParseException;
33import org.xml.sax.SAXNotRecognizedException;
34import org.xml.sax.SAXNotSupportedException;
35import org.apache.xerces.parsers.DOMParser;
36import org.apache.xerces.dom.DocumentImpl; // for new Documents
37import org.apache.xerces.dom.DocumentTypeImpl;
38
39// other java classes
40import java.io.BufferedWriter;
41import java.io.ByteArrayInputStream;
42import java.io.FileWriter;
43import java.io.InputStream;
44import java.io.Reader;
45import java.io.InputStreamReader;
46import java.io.StringReader;
47import java.io.File;
48import java.io.FileInputStream;
49import java.io.FileReader;
50import java.util.regex.*;
51
52import org.apache.log4j.*;
53
54// Apache Commons
55import org.apache.commons.lang3.*;
56
57import java.util.*;
58import java.lang.reflect.*;
59
60/**
61 * XMLConverter - utility class for greenstone
62 *
63 * generates new Documents
64 * parses XML Strings into Documents, converts Nodes to Strings
65 * different parsers have different behaviour - can experiment in here
66 * at the moment we only use xerces
67 * all xerces specific code is in here
68 */
69public class XMLConverter
70{
71
72 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74 /** the no-args constructor */
75 public XMLConverter()
76 {
77
78 }
79
80 /** returns a DOM Document */
81 public static Document getDOM(String in)
82 {
83
84 try
85 {
86 Reader reader = new StringReader(in);
87 InputSource xml_source = new InputSource(reader);
88 Document doc = getDOM(xml_source, null);
89 reader.close();
90 return doc;
91
92 }
93 catch (Exception e)
94 {
95 logger.error(e.getMessage());
96 }
97 return null;
98 }
99
100 /** returns a DOM Document */
101 public static Document getDOM(String in, String encoding)
102 {
103 try
104 {
105 InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
106 InputSource xml_source = new InputSource(reader);
107 Document doc = getDOM(xml_source, null);
108 reader.close();
109 return doc;
110
111 }
112 catch (Exception e)
113 {
114 logger.error(e.getMessage());
115 }
116 return null;
117 }
118
119 /** returns a DOM Document */
120 public static Document getDOM(File in) {
121 try
122 {
123 FileReader reader = new FileReader(in);
124 InputSource xml_source = new InputSource(reader);
125 Document doc = getDOM(xml_source, null);
126 reader.close();
127 return doc;
128
129 }
130 catch (Exception e)
131 {
132 logger.error(e.getMessage(), e);
133
134 }
135 return null;
136 }
137
138 public static Document getDOM(File in, String encoding) {
139 return getDOM(in, encoding, null);
140 }
141
142 /** returns a DOM document */
143 public static Document getDOM(File in, String encoding, EntityResolver er) {
144
145 try {
146
147
148 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
149 InputSource xml_source = new InputSource(isr);
150 Document doc = getDOM(xml_source, er);
151 isr.close();
152 return doc;
153
154 }
155 catch (Exception e)
156 {
157 logger.error(e.getMessage());
158 }
159 return null;
160 }
161
162 public static Document getDOM(File in, EntityResolver er) {
163
164 try {
165 InputSource xml_source = new InputSource(new FileInputStream(in));
166 Document doc = getDOM(xml_source, er);
167 return doc;
168 }
169 catch (Exception e)
170 {
171 logger.error(e.getMessage());
172 }
173 return null;
174 }
175
176 public static Document getDOM(InputSource source, EntityResolver er) {
177
178 try {
179 DOMParser parser = new DOMParser();
180 parser.setFeature("http://xml.org/sax/features/validation", false);
181 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
182 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
183 // a performance test showed that having this on lead to increased
184 // memory use for small-medium docs, and not much gain for large
185 // docs.
186 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
187 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
188 // add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
189 parser.setErrorHandler(new ParseErrorHandler());
190 if (er != null) {
191 parser.setEntityResolver(er);
192 }
193 parser.parse(source);
194
195 Document doc = parser.getDocument();
196 return doc;
197
198 } catch (Exception e) {
199
200 logger.error(e.getMessage());
201 }
202 return null;
203
204 }
205
206 /** creates a new empty DOM Document */
207 public static Document newDOM()
208 {
209 Document doc = new DocumentImpl();
210 return doc;
211 }
212
213 /**
214 * This method's parameters represent the parts of the Doctype of this
215 * Document that is to be created. For more info see
216 * http://xerces.apache.org
217 * /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
218 * .html#DocumentTypeImpl
219 * (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
220 *
221 * */
222 public static Document newDOM(String qualifiedName, String publicID, String systemID)
223 {
224 // create empty DOM document
225 DocumentImpl docImpl = new DocumentImpl();
226
227 // Need to use the document to create the docType for it
228 DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
229
230 // Although we have created the docType using the document, we need to still
231 // put it into the empty document we just created
232 try
233 {
234 docImpl.appendChild(myDocType);
235 }
236 catch (Exception e)
237 {
238 System.out.println("Could not append docType because: " + e);
239 }
240
241 // return the document containing a DocType
242 return docImpl;
243 }
244
245 /** returns the Node as a String */
246 public static String getString(Node xmlNode)
247 {
248 StringBuffer xmlRepresentation = new StringBuffer();
249 getString(xmlNode, xmlRepresentation, 0, false);
250 return xmlRepresentation.toString();
251 }
252
253 /**
254 * returns the node as a nicely formatted String - this introduces extra
255 * text nodes if the String is read back in as a DOM, so should only be used
256 * for printing
257 */
258 public static String getPrettyString(Node xmlNode)
259 {
260 StringBuffer xmlRepresentation = new StringBuffer();
261 getString(xmlNode, xmlRepresentation, 0, true);
262 return xmlRepresentation.toString();
263 }
264
265 /*
266 * For the purposes of logger.debug statements, where this is called and
267 * hence outputted, returns an empty string if debugging is not enabled
268 */
269 public static String getPrettyStringLogger(Node xmlNode, Logger log)
270 {
271
272 if (log.isDebugEnabled())
273 return getPrettyString(xmlNode);
274
275 return "";
276
277 }
278
279 private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
280 {
281
282 if (xmlNode == null)
283 {
284 xmlRepresentation.append("<null>");
285 return;
286 }
287
288 short nodeType = xmlNode.getNodeType();
289 String nodeName = xmlNode.getNodeName();
290
291 if (nodeType == Node.DOCUMENT_NODE)
292 {
293 Document xmlDocNode = (Document) xmlNode;
294
295 //if (xmlDocNode.getDoctype() == null) {
296 //System.err.println("Doctype is null.");
297 //}
298 //else {
299 if (xmlDocNode.getDoctype() != null)
300 {
301 DocumentType dt = xmlDocNode.getDoctype();
302
303 String name = dt.getName();
304 String pid = dt.getPublicId();
305 String sid = dt.getSystemId();
306
307 // Use previously assigned name, not dt.getName() again
308 String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
309
310 xmlRepresentation.append(doctype_str);
311 }
312 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
313 return;
314 }
315 // Handle Element nodes
316 if (nodeType == Node.ELEMENT_NODE)
317 {
318 if (pretty)
319 {
320 xmlRepresentation.append("\n");
321 for (int i = 0; i < depth; i++)
322 {
323 xmlRepresentation.append(" ");
324 }
325 }
326
327 // Write opening tag
328 xmlRepresentation.append("<");
329 xmlRepresentation.append(nodeName);
330
331 // Write the node attributes
332 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
333 for (int i = 0; i < nodeAttributes.getLength(); i++)
334 {
335 Node attribute = nodeAttributes.item(i);
336 xmlRepresentation.append(" ");
337 xmlRepresentation.append(attribute.getNodeName());
338 xmlRepresentation.append("=\"");
339 xmlRepresentation.append(attribute.getNodeValue());
340 xmlRepresentation.append("\"");
341 }
342
343 // If the node has no children, close the opening tag and return
344 if (xmlNode.hasChildNodes() == false)
345 {
346 // This produces somewhat ugly output, but it is necessary to compensate
347 // for display bugs in Netscape. Firstly, the space is needed before the
348 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
349 // example). Also, a newline character would be expected after the tag,
350 // but this causes problems with the display of links (the link text
351 // will contain a newline character, which is displayed badly).
352 xmlRepresentation.append(" />");
353 return;
354 }
355
356 // Close the opening tag
357 xmlRepresentation.append(">");
358
359 // Process the children. We process text nodes here, but recursively process other nodes.
360 // hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
361 // Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
362 // also if these are inside a pre tag then the space shows up in the page.
363
364 NodeList children = xmlNode.getChildNodes();
365 boolean do_pretty = pretty;
366 boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
367 for (int i = 0; i < children.getLength(); i++)
368 {
369 Node child = children.item(i);
370 short child_type = child.getNodeType();
371 if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
372 if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
373 output_escaping = false;
374 }
375 else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
376 output_escaping = true;
377 }
378 else {
379 logger.warn("Unhandled processing instruction " + child.getNodeName());
380 }
381 }
382 else if (child_type == Node.TEXT_NODE) {
383 do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
384 // output the text
385 String text = child.getNodeValue();
386
387 // Perform output escaping, if required
388 // Apache Commons replace method is far superior to String.replaceAll - very fast!
389 if (output_escaping) {
390 text = StringUtils.replace(text, "&", "&amp;");
391 text = StringUtils.replace(text, "<", "&lt;");
392 text = StringUtils.replace(text, ">", "&gt;");
393 text = StringUtils.replace(text, "'", "&apos;");
394 text = StringUtils.replace(text, "\"", "&quot;");
395 }
396 // Remove any control-C characters
397 text = StringUtils.replace(text, "" + (char) 3, "");
398
399 xmlRepresentation.append(text);
400
401 }
402 else {
403 // recursively call getString
404 getString(child, xmlRepresentation, depth + 1, do_pretty);
405 }
406 } // foreach child of the element
407
408 // Write closing tag
409 if (pretty)
410 {
411 if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
412 {
413 for (int i = 0; i < depth; i++)
414 xmlRepresentation.append(" ");
415 }
416 }
417 xmlRepresentation.append("</");
418 xmlRepresentation.append(nodeName);
419 xmlRepresentation.append(">");
420 if (pretty)
421 {
422 xmlRepresentation.append("\n");
423 }
424 } // ELEMENT_NODE
425
426 else if (nodeType == Node.COMMENT_NODE)
427 {
428 String text = xmlNode.getNodeValue();
429 xmlRepresentation.append("<!-- ");
430 xmlRepresentation.append(text);
431 xmlRepresentation.append(" -->");
432 }
433
434 // TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
435 // A type of node that is not handled yet
436 else
437 {
438 logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
439 }
440
441 return;
442 }
443
444 protected static String getNodeTypeString(short node_type)
445 {
446
447 String type = "";
448 switch (node_type)
449 {
450 case Node.ATTRIBUTE_NODE:
451 type = "ATTRIBUTE_NODE";
452 break;
453 case Node.CDATA_SECTION_NODE:
454 type = "CDATA_SECTION_NODE";
455 break;
456 case Node.COMMENT_NODE:
457 type = "COMMENT_NODE";
458 break;
459 case Node.DOCUMENT_FRAGMENT_NODE:
460 type = "DOCUMENT_FRAGMENT_NODE";
461 break;
462 case Node.DOCUMENT_NODE:
463 type = "DOCUMENT_NODE";
464 break;
465 case Node.DOCUMENT_TYPE_NODE:
466 type = "DOCUMENT_TYPE_NODE";
467 break;
468 case Node.ELEMENT_NODE:
469 type = "ELEMENT_NODE";
470 break;
471 case Node.ENTITY_NODE:
472 type = "ENTITY_NODE";
473 break;
474 case Node.ENTITY_REFERENCE_NODE:
475 type = "ENTITY_REFERENCE_NODE";
476 break;
477 case Node.NOTATION_NODE:
478 type = "NOTATION_NODE";
479 break;
480 case Node.PROCESSING_INSTRUCTION_NODE:
481 type = "PROCESSING_INSTRUCTION_NODE";
482 break;
483 case Node.TEXT_NODE:
484 type = "TEXT_NODE";
485 break;
486 default:
487 type = "UNKNOWN";
488 }
489
490 return type;
491 }
492
493 // returns null if there no error occurred during parsing, or else returns the error message
494
495 // public String getParseErrorMessage()
496 // {
497 // ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
498 // return errorHandler.getErrorMessage();
499 // }
500
501 // Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
502 // register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
503 // errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
504 // is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
505 static public class ParseErrorHandler implements ErrorHandler
506 {
507 protected String errorMessage = null;
508
509 // Receive notification of a recoverable error.
510 public void error(SAXParseException exception)
511 {
512 handleError("Error:\n", exception);
513 }
514
515 // Receive notification of a non-recoverable error.
516 public void fatalError(SAXParseException exception)
517 {
518 handleError("Fatal Error:\n", exception);
519 }
520
521 // Receive notification of a warning.
522 public void warning(SAXParseException exception)
523 {
524 handleError("Warning:\n", exception);
525 }
526
527 public String toString(SAXParseException e)
528 {
529 String msg = e.getMessage();
530 msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
531 msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
532 msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
533
534 return msg;
535 }
536
537 // clears the errorPage variable after first call to this method
538 public String getErrorMessage()
539 {
540 String errMsg = this.errorMessage;
541 if (this.errorMessage != null)
542 {
543 this.errorMessage = null;
544 }
545 return errMsg;
546 }
547
548 // sets the errorMessage member variable to the data stored in the exception
549 // and writes the errorMessage to the logger and tomcat's System.err
550 protected void handleError(String errorType, SAXParseException exception)
551 {
552 this.errorMessage = errorType + toString(exception);
553 System.err.println("\n****Error parsing xml:\n" + this.errorMessage + "\n****\n");
554 logger.error(this.errorMessage);
555 }
556 }
557
558 public static boolean writeDOM(Element elem, File file) {
559
560 BufferedWriter writer = null;
561 boolean success = false;
562 try {
563 String xml_string = getString(elem);
564 // need createNewFile???
565 writer = new BufferedWriter(new FileWriter(file));
566 writer.write(xml_string);
567 success = true;
568 }
569
570 catch (Exception e) {
571 logger.error(e.getMessage());
572 success = false;
573 }
574 finally {
575 try {
576 if (writer != null) {
577 writer.close();
578 }
579 } catch(Exception e) {
580 logger.error("couldn't close the file"+e.getMessage());
581 }
582 }
583 return success;
584 }
585}
Note: See TracBrowser for help on using the repository browser.