Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28962

Last change on this file since 28962 was 28962, checked in by kjdon, 10 years ago
getDOM was already creating a new parser each time, so we don't need one in the constructor. Now that the constructor does essentially nothing, I have made all the methods static, and newDOM is always called statically. getDOM and the others may now also be called statically. I might remove every this.converter and just use the class statically, but haven't done that yet. I have consolidated the getDOM methods for maximum code reuse. Only the file + encoding version was actually using an EntityResolver in the code, so that is the only one to which I have added it as a param. In future we may need other getDOM methods with an EntityResolver param. nodeToElement has moved to GSXML as it doesn't need to be here.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.xml.sax.SAXNotRecognizedException;
34	import org.xml.sax.SAXNotSupportedException;
35	import org.apache.xerces.parsers.DOMParser;
36	import org.apache.xerces.dom.DocumentImpl; // for new Documents
37	import org.apache.xerces.dom.DocumentTypeImpl;
38
39	// other java classes
40	import java.io.BufferedWriter;
41	import java.io.ByteArrayInputStream;
42	import java.io.FileWriter;
43	import java.io.InputStream;
44	import java.io.Reader;
45	import java.io.InputStreamReader;
46	import java.io.StringReader;
47	import java.io.File;
48	import java.io.FileInputStream;
49	import java.io.FileReader;
50	import java.util.regex.*;
51
52	import org.apache.log4j.*;
53
54	// Apache Commons
55	import org.apache.commons.lang3.*;
56
57	import java.util.*;
58	import java.lang.reflect.*;
59
60	/**
61	* XMLConverter - utility class for greenstone
62	*
63	* generates new Documents
64	* parses XML Strings into Documents, converts Nodes to Strings
65	* different parsers have different behaviour - can experiment in here
66	* at the moment we only use xerces
67	* all xerces specific code is in here
68	*/
69	public class XMLConverter
70	{
71
72	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74	//protected EntityResolver resolver = null;
75
76	private static boolean outputEscaping = true;
77
78	/** the no-args constructor */
79	public XMLConverter()
80	{
81
82	}
83
84	// /** sets the entity resolver. pass in null to unset it */
85	// public void setEntityResolver(EntityResolver er)
86	// {
87	// this.resolver = er;
88	// }
89
90	/** returns a DOM Document */
91	public static Document getDOM(String in)
92	{
93
94	try
95	{
96	Reader reader = new StringReader(in);
97	InputSource xml_source = new InputSource(reader);
98	Document doc = getDOM(xml_source, null);
99	reader.close();
100	return doc;
101
102	}
103	catch (Exception e)
104	{
105	logger.error(e.getMessage());
106	}
107	return null;
108	}
109
110	/** returns a DOM Document */
111	public static Document getDOM(String in, String encoding)
112	{
113	try
114	{
115	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
116	InputSource xml_source = new InputSource(reader);
117	Document doc = getDOM(xml_source, null);
118	reader.close();
119	return doc;
120
121	}
122	catch (Exception e)
123	{
124	logger.error(e.getMessage());
125	}
126	return null;
127	}
128
129	/** returns a DOM Document */
130	public static Document getDOM(File in) {
131	try
132	{
133	FileReader reader = new FileReader(in);
134	InputSource xml_source = new InputSource(reader);
135	Document doc = getDOM(xml_source, null);
136	reader.close();
137	return doc;
138
139	}
140	catch (Exception e)
141	{
142	logger.error(e.getMessage(), e);
143
144	}
145	return null;
146	}
147
148	public static Document getDOM(File in, String encoding) {
149	return getDOM(in, encoding, null);
150	}
151
152	/** returns a DOM document */
153	public static Document getDOM(File in, String encoding, EntityResolver er) {
154
155	try {
156
157
158	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
159	InputSource xml_source = new InputSource(isr);
160	Document doc = getDOM(xml_source, er);
161	isr.close();
162	return doc;
163
164	}
165	catch (Exception e)
166	{
167	logger.error(e.getMessage());
168	}
169	return null;
170	}
171
172	public static Document getDOM(InputSource source, EntityResolver er) {
173
174	try {
175	DOMParser parser = new DOMParser();
176	parser.setFeature("http://xml.org/sax/features/validation", false);
177	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
178	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
179	// a performance test showed that having this on lead to increased
180	// memory use for small-medium docs, and not much gain for large
181	// docs.
182	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
183	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
184	// add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
185	parser.setErrorHandler(new ParseErrorHandler());
186	if (er != null) {
187	parser.setEntityResolver(er);
188	}
189	parser.parse(source);
190
191	Document doc = parser.getDocument();
192	return doc;
193
194	} catch (Exception e) {
195
196	logger.error(e.getMessage());
197	}
198	return null;
199
200	}
201
202	/** creates a new empty DOM Document */
203	public static Document newDOM()
204	{
205	Document doc = new DocumentImpl();
206	return doc;
207	}
208
209	/**
210	* This method's parameters represent the parts of the Doctype of this
211	* Document that is to be created. For more info see
212	* http://xerces.apache.org
213	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
214	* .html#DocumentTypeImpl
215	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
216	*
217	* */
218	public static Document newDOM(String qualifiedName, String publicID, String systemID)
219	{
220	// create empty DOM document
221	DocumentImpl docImpl = new DocumentImpl();
222
223	// Need to use the document to create the docType for it
224	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
225
226	// Although we have created the docType using the document, we need to still
227	// put it into the empty document we just created
228	try
229	{
230	docImpl.appendChild(myDocType);
231	}
232	catch (Exception e)
233	{
234	System.out.println("Could not append docType because: " + e);
235	}
236
237	// return the document containing a DocType
238	return docImpl;
239	}
240
241	/** returns the Node as a String */
242	public static String getString(Node xmlNode)
243	{
244	outputEscaping = true;
245	StringBuffer xmlRepresentation = new StringBuffer();
246	getString(xmlNode, xmlRepresentation, 0, false);
247	return xmlRepresentation.toString();
248	}
249
250	/**
251	* returns the node as a nicely formatted String - this introduces extra
252	* text nodes if the String is read back in as a DOM, so should only be used
253	* for printing
254	*/
255	public static String getPrettyString(Node xmlNode)
256	{
257
258	outputEscaping = true;
259	StringBuffer xmlRepresentation = new StringBuffer();
260	getString(xmlNode, xmlRepresentation, 0, true);
261	return xmlRepresentation.toString();
262	}
263
264	/*
265	* For the purposes of logger.debug statements, where this is called and
266	* hence outputted, returns an empty string if debugging is not enabled
267	*/
268	public static String getPrettyStringLogger(Node xmlNode, Logger log)
269	{
270
271	if (log.isDebugEnabled())
272	return getPrettyString(xmlNode);
273
274	return "";
275
276	}
277
278	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
279	{
280
281	if (xmlNode == null)
282	{
283	xmlRepresentation.append("<null>");
284	return;
285	}
286
287	short nodeType = xmlNode.getNodeType();
288	String nodeName = xmlNode.getNodeName();
289
290	if (nodeType == Node.DOCUMENT_NODE)
291	{
292	Document xmlDocNode = (Document) xmlNode;
293
294	//if (xmlDocNode.getDoctype() == null) {
295	//System.err.println("Doctype is null.");
296	//}
297	//else {
298	if (xmlDocNode.getDoctype() != null)
299	{
300	DocumentType dt = xmlDocNode.getDoctype();
301
302	String name = dt.getName();
303	String pid = dt.getPublicId();
304	String sid = dt.getSystemId();
305
306	// Use previously assigned name, not dt.getName() again
307	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
308
309	xmlRepresentation.append(doctype_str);
310	}
311	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
312	return;
313	}
314	// Handle Element nodes
315	if (nodeType == Node.ELEMENT_NODE)
316	{
317	if (pretty)
318	{
319	xmlRepresentation.append("\n");
320	for (int i = 0; i < depth; i++)
321	{
322	xmlRepresentation.append(" ");
323	}
324	}
325
326	// Write opening tag
327	xmlRepresentation.append("<");
328	xmlRepresentation.append(nodeName);
329
330	// Write the node attributes
331	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
332	for (int i = 0; i < nodeAttributes.getLength(); i++)
333	{
334	Node attribute = nodeAttributes.item(i);
335	xmlRepresentation.append(" ");
336	xmlRepresentation.append(attribute.getNodeName());
337	xmlRepresentation.append("=\"");
338	xmlRepresentation.append(attribute.getNodeValue());
339	xmlRepresentation.append("\"");
340	}
341
342	// If the node has no children, close the opening tag and return
343	if (xmlNode.hasChildNodes() == false)
344	{
345	// This produces somewhat ugly output, but it is necessary to compensate
346	// for display bugs in Netscape. Firstly, the space is needed before the
347	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
348	// example). Also, a newline character would be expected after the tag,
349	// but this causes problems with the display of links (the link text
350	// will contain a newline character, which is displayed badly).
351	xmlRepresentation.append(" />");
352	return;
353	}
354
355	// Close the opening tag
356	xmlRepresentation.append(">");
357
358	// Apply recursively to the children of this node
359	// hack for nodes next to text nodes - dont make them pretty
360	// this is needed for text inside a <pre> element - any new lines
361	// or spaces around the span elements show up in the text
362	NodeList children = xmlNode.getChildNodes();
363	boolean do_pretty = pretty;
364	for (int i = 0; i < children.getLength(); i++)
365	{
366	if (children.item(i).getNodeType() == Node.TEXT_NODE)
367	{
368	do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
369	}
370	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
371	}
372
373	// Write closing tag
374	if (pretty)
375	{
376	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
377	{
378	for (int i = 0; i < depth; i++)
379	xmlRepresentation.append(" ");
380	}
381	}
382	xmlRepresentation.append("</");
383	xmlRepresentation.append(nodeName);
384	xmlRepresentation.append(">");
385	if (pretty)
386	{
387	xmlRepresentation.append("\n");
388	}
389	}
390
391	// Handle Text nodes
392	else if (nodeType == Node.TEXT_NODE)
393	{
394	String text = xmlNode.getNodeValue();
395
396	// Perform output escaping, if required
397	// Apache Commons replace method is far superior to String.replaceAll - very fast!
398	if (outputEscaping)
399	{
400
401	text = StringUtils.replace(text, "&", "&");
402	text = StringUtils.replace(text, "<", "<");
403	text = StringUtils.replace(text, ">", ">");
404	text = StringUtils.replace(text, "'", "'");
405	text = StringUtils.replace(text, "\"", """);
406	}
407
408	// Remove any control-C characters
409	text = StringUtils.replace(text, "" + (char) 3, "");
410
411	xmlRepresentation.append(text);
412	}
413
414	// Handle Processing Instruction nodes
415	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
416	{
417	if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
418	{
419	outputEscaping = false;
420	}
421	else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
422	{
423	outputEscaping = true;
424	}
425	else
426	{
427	logger.warn("Unhandled processing instruction " + nodeName);
428	}
429	}
430
431	else if (nodeType == Node.COMMENT_NODE)
432	{
433	String text = xmlNode.getNodeValue();
434	xmlRepresentation.append("<!-- ");
435	xmlRepresentation.append(text);
436	xmlRepresentation.append(" -->");
437	}
438
439	// A type of node that is not handled yet
440	else
441	{
442	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
443	}
444
445	return;
446	}
447
448	protected static String getNodeTypeString(short node_type)
449	{
450
451	String type = "";
452	switch (node_type)
453	{
454	case Node.ATTRIBUTE_NODE:
455	type = "ATTRIBUTE_NODE";
456	break;
457	case Node.CDATA_SECTION_NODE:
458	type = "CDATA_SECTION_NODE";
459	break;
460	case Node.COMMENT_NODE:
461	type = "COMMENT_NODE";
462	break;
463	case Node.DOCUMENT_FRAGMENT_NODE:
464	type = "DOCUMENT_FRAGMENT_NODE";
465	break;
466	case Node.DOCUMENT_NODE:
467	type = "DOCUMENT_NODE";
468	break;
469	case Node.DOCUMENT_TYPE_NODE:
470	type = "DOCUMENT_TYPE_NODE";
471	break;
472	case Node.ELEMENT_NODE:
473	type = "ELEMENT_NODE";
474	break;
475	case Node.ENTITY_NODE:
476	type = "ENTITY_NODE";
477	break;
478	case Node.ENTITY_REFERENCE_NODE:
479	type = "ENTITY_REFERENCE_NODE";
480	break;
481	case Node.NOTATION_NODE:
482	type = "NOTATION_NODE";
483	break;
484	case Node.PROCESSING_INSTRUCTION_NODE:
485	type = "PROCESSING_INSTRUCTION_NODE";
486	break;
487	case Node.TEXT_NODE:
488	type = "TEXT_NODE";
489	break;
490	default:
491	type = "UNKNOWN";
492	}
493
494	return type;
495	}
496
497	// returns null if no error occurred during parsing, or else returns the error message
498
499	// public String getParseErrorMessage()
500	// {
501	// ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
502	// return errorHandler.getErrorMessage();
503	// }
504
505	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
506	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
507	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
508	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
509	static public class ParseErrorHandler implements ErrorHandler
510	{
511	protected String errorMessage = null;
512
513	// Receive notification of a recoverable error.
514	public void error(SAXParseException exception)
515	{
516	handleError("Error:\n", exception);
517	}
518
519	// Receive notification of a non-recoverable error.
520	public void fatalError(SAXParseException exception)
521	{
522	handleError("Fatal Error:\n", exception);
523	}
524
525	// Receive notification of a warning.
526	public void warning(SAXParseException exception)
527	{
528	handleError("Warning:\n", exception);
529	}
530
531	public String toString(SAXParseException e)
532	{
533	String msg = e.getMessage();
534	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
535	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
536	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
537
538	return msg;
539	}
540
541	// clears the errorPage variable after first call to this method
542	public String getErrorMessage()
543	{
544	String errMsg = this.errorMessage;
545	if (this.errorMessage != null)
546	{
547	this.errorMessage = null;
548	}
549	return errMsg;
550	}
551
552	// sets the errorMessage member variable to the data stored in the exception
553	// and writes the errorMessage to the logger and tomcat's System.err
554	protected void handleError(String errorType, SAXParseException exception)
555	{
556	this.errorMessage = errorType + toString(exception);
557	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
558	logger.error(this.errorMessage);
559	}
560	}
561
562	public static boolean writeDOM(Element elem, File file) {
563
564	BufferedWriter writer = null;
565	boolean success = false;
566	try {
567	String xml_string = getString(elem);
568	// need createNewFile???
569	writer = new BufferedWriter(new FileWriter(file));
570	writer.write(xml_string);
571	success = true;
572	}
573
574	catch (Exception e) {
575	logger.error(e.getMessage());
576	success = false;
577	}
578	finally {
579	try {
580	if (writer != null) {
581	writer.close();
582	}
583	} catch(Exception e) {
584	logger.error("couldn't close the file"+e.getMessage());
585	}
586	}
587	return success;
588	}
589	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: