Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28978

Last change on this file since 28978 was 28978, checked in by kjdon, 10 years ago
removed commented out code. removed static variable outputEscaping. what will happen if two threads are doing pretty printing???. Now we process text and processing instruction nodes in a line, not recursively, so can use the processing instructions when processing the text nodes in the for loop.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.4 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.xml.sax.SAXNotRecognizedException;
34	import org.xml.sax.SAXNotSupportedException;
35	import org.apache.xerces.parsers.DOMParser;
36	import org.apache.xerces.dom.DocumentImpl; // for new Documents
37	import org.apache.xerces.dom.DocumentTypeImpl;
38
39	// other java classes
40	import java.io.BufferedWriter;
41	import java.io.ByteArrayInputStream;
42	import java.io.FileWriter;
43	import java.io.InputStream;
44	import java.io.Reader;
45	import java.io.InputStreamReader;
46	import java.io.StringReader;
47	import java.io.File;
48	import java.io.FileInputStream;
49	import java.io.FileReader;
50	import java.util.regex.*;
51
52	import org.apache.log4j.*;
53
54	// Apache Commons
55	import org.apache.commons.lang3.*;
56
57	import java.util.*;
58	import java.lang.reflect.*;
59
60	/**
61	* XMLConverter - utility class for greenstone
62	*
63	* generates new Documents
64	* parses XML Strings into Documents, converts Nodes to Strings
65	* different parsers have different behaviour - can experiment in here
66	* at the moment we only use xerces
67	* all xerces specific code is in here
68	*/
69	public class XMLConverter
70	{
71
72	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74	/** the no-args constructor */
75	public XMLConverter()
76	{
77
78	}
79
80	/** returns a DOM Document */
81	public static Document getDOM(String in)
82	{
83
84	try
85	{
86	Reader reader = new StringReader(in);
87	InputSource xml_source = new InputSource(reader);
88	Document doc = getDOM(xml_source, null);
89	reader.close();
90	return doc;
91
92	}
93	catch (Exception e)
94	{
95	logger.error(e.getMessage());
96	}
97	return null;
98	}
99
100	/** returns a DOM Document */
101	public static Document getDOM(String in, String encoding)
102	{
103	try
104	{
105	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
106	InputSource xml_source = new InputSource(reader);
107	Document doc = getDOM(xml_source, null);
108	reader.close();
109	return doc;
110
111	}
112	catch (Exception e)
113	{
114	logger.error(e.getMessage());
115	}
116	return null;
117	}
118
119	/** returns a DOM Document */
120	public static Document getDOM(File in) {
121	try
122	{
123	FileReader reader = new FileReader(in);
124	InputSource xml_source = new InputSource(reader);
125	Document doc = getDOM(xml_source, null);
126	reader.close();
127	return doc;
128
129	}
130	catch (Exception e)
131	{
132	logger.error(e.getMessage(), e);
133
134	}
135	return null;
136	}
137
138	public static Document getDOM(File in, String encoding) {
139	return getDOM(in, encoding, null);
140	}
141
142	/** returns a DOM document */
143	public static Document getDOM(File in, String encoding, EntityResolver er) {
144
145	try {
146
147
148	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
149	InputSource xml_source = new InputSource(isr);
150	Document doc = getDOM(xml_source, er);
151	isr.close();
152	return doc;
153
154	}
155	catch (Exception e)
156	{
157	logger.error(e.getMessage());
158	}
159	return null;
160	}
161
162	public static Document getDOM(InputSource source, EntityResolver er) {
163
164	try {
165	DOMParser parser = new DOMParser();
166	parser.setFeature("http://xml.org/sax/features/validation", false);
167	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
168	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
169	// a performance test showed that having this on lead to increased
170	// memory use for small-medium docs, and not much gain for large
171	// docs.
172	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
173	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
174	// add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
175	parser.setErrorHandler(new ParseErrorHandler());
176	if (er != null) {
177	parser.setEntityResolver(er);
178	}
179	parser.parse(source);
180
181	Document doc = parser.getDocument();
182	return doc;
183
184	} catch (Exception e) {
185
186	logger.error(e.getMessage());
187	}
188	return null;
189
190	}
191
192	/** creates a new empty DOM Document */
193	public static Document newDOM()
194	{
195	Document doc = new DocumentImpl();
196	return doc;
197	}
198
199	/**
200	* This method's parameters represent the parts of the Doctype of this
201	* Document that is to be created. For more info see
202	* http://xerces.apache.org
203	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
204	* .html#DocumentTypeImpl
205	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
206	*
207	* */
208	public static Document newDOM(String qualifiedName, String publicID, String systemID)
209	{
210	// create empty DOM document
211	DocumentImpl docImpl = new DocumentImpl();
212
213	// Need to use the document to create the docType for it
214	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
215
216	// Although we have created the docType using the document, we need to still
217	// put it into the empty document we just created
218	try
219	{
220	docImpl.appendChild(myDocType);
221	}
222	catch (Exception e)
223	{
224	System.out.println("Could not append docType because: " + e);
225	}
226
227	// return the document containing a DocType
228	return docImpl;
229	}
230
231	/** returns the Node as a String */
232	public static String getString(Node xmlNode)
233	{
234	StringBuffer xmlRepresentation = new StringBuffer();
235	getString(xmlNode, xmlRepresentation, 0, false);
236	return xmlRepresentation.toString();
237	}
238
239	/**
240	* returns the node as a nicely formatted String - this introduces extra
241	* text nodes if the String is read back in as a DOM, so should only be used
242	* for printing
243	*/
244	public static String getPrettyString(Node xmlNode)
245	{
246	StringBuffer xmlRepresentation = new StringBuffer();
247	getString(xmlNode, xmlRepresentation, 0, true);
248	return xmlRepresentation.toString();
249	}
250
251	/*
252	* For the purposes of logger.debug statements, where this is called and
253	* hence outputted, returns an empty string if debugging is not enabled
254	*/
255	public static String getPrettyStringLogger(Node xmlNode, Logger log)
256	{
257
258	if (log.isDebugEnabled())
259	return getPrettyString(xmlNode);
260
261	return "";
262
263	}
264
265	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
266	{
267
268	if (xmlNode == null)
269	{
270	xmlRepresentation.append("<null>");
271	return;
272	}
273
274	short nodeType = xmlNode.getNodeType();
275	String nodeName = xmlNode.getNodeName();
276
277	if (nodeType == Node.DOCUMENT_NODE)
278	{
279	Document xmlDocNode = (Document) xmlNode;
280
281	//if (xmlDocNode.getDoctype() == null) {
282	//System.err.println("Doctype is null.");
283	//}
284	//else {
285	if (xmlDocNode.getDoctype() != null)
286	{
287	DocumentType dt = xmlDocNode.getDoctype();
288
289	String name = dt.getName();
290	String pid = dt.getPublicId();
291	String sid = dt.getSystemId();
292
293	// Use previously assigned name, not dt.getName() again
294	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
295
296	xmlRepresentation.append(doctype_str);
297	}
298	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
299	return;
300	}
301	// Handle Element nodes
302	if (nodeType == Node.ELEMENT_NODE)
303	{
304	if (pretty)
305	{
306	xmlRepresentation.append("\n");
307	for (int i = 0; i < depth; i++)
308	{
309	xmlRepresentation.append(" ");
310	}
311	}
312
313	// Write opening tag
314	xmlRepresentation.append("<");
315	xmlRepresentation.append(nodeName);
316
317	// Write the node attributes
318	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
319	for (int i = 0; i < nodeAttributes.getLength(); i++)
320	{
321	Node attribute = nodeAttributes.item(i);
322	xmlRepresentation.append(" ");
323	xmlRepresentation.append(attribute.getNodeName());
324	xmlRepresentation.append("=\"");
325	xmlRepresentation.append(attribute.getNodeValue());
326	xmlRepresentation.append("\"");
327	}
328
329	// If the node has no children, close the opening tag and return
330	if (xmlNode.hasChildNodes() == false)
331	{
332	// This produces somewhat ugly output, but it is necessary to compensate
333	// for display bugs in Netscape. Firstly, the space is needed before the
334	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
335	// example). Also, a newline character would be expected after the tag,
336	// but this causes problems with the display of links (the link text
337	// will contain a newline character, which is displayed badly).
338	xmlRepresentation.append(" />");
339	return;
340	}
341
342	// Close the opening tag
343	xmlRepresentation.append(">");
344
345	// Process the children. We process text nodes here, but recursively process other nodes.
346	// hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
347	// Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
348	// also if these are inside a pre tag then the space shows up in the page.
349
350	NodeList children = xmlNode.getChildNodes();
351	boolean do_pretty = pretty;
352	boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
353	for (int i = 0; i < children.getLength(); i++)
354	{
355	Node child = children.item(i);
356	short child_type = child.getNodeType();
357	if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
358	if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
359	output_escaping = false;
360	}
361	else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
362	output_escaping = true;
363	}
364	else {
365	logger.warn("Unhandled processing instruction " + child.getNodeName());
366	}
367	}
368	else if (child_type == Node.TEXT_NODE) {
369	do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
370	// output the text
371	String text = child.getNodeValue();
372
373	// Perform output escaping, if required
374	// Apache Commons replace method is far superior to String.replaceAll - very fast!
375	if (output_escaping) {
376	text = StringUtils.replace(text, "&", "&");
377	text = StringUtils.replace(text, "<", "<");
378	text = StringUtils.replace(text, ">", ">");
379	text = StringUtils.replace(text, "'", "'");
380	text = StringUtils.replace(text, "\"", """);
381	}
382	// Remove any control-C characters
383	text = StringUtils.replace(text, "" + (char) 3, "");
384
385	xmlRepresentation.append(text);
386
387	}
388	else {
389	// recursively call getString
390	getString(child, xmlRepresentation, depth + 1, do_pretty);
391	}
392	} // foreach child of the element
393
394	// Write closing tag
395	if (pretty)
396	{
397	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
398	{
399	for (int i = 0; i < depth; i++)
400	xmlRepresentation.append(" ");
401	}
402	}
403	xmlRepresentation.append("</");
404	xmlRepresentation.append(nodeName);
405	xmlRepresentation.append(">");
406	if (pretty)
407	{
408	xmlRepresentation.append("\n");
409	}
410	} // ELEMENT_NODE
411
412	else if (nodeType == Node.COMMENT_NODE)
413	{
414	String text = xmlNode.getNodeValue();
415	xmlRepresentation.append("<!-- ");
416	xmlRepresentation.append(text);
417	xmlRepresentation.append(" -->");
418	}
419
420	// TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
421	// A type of node that is not handled yet
422	else
423	{
424	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
425	}
426
427	return;
428	}
429
430	protected static String getNodeTypeString(short node_type)
431	{
432
433	String type = "";
434	switch (node_type)
435	{
436	case Node.ATTRIBUTE_NODE:
437	type = "ATTRIBUTE_NODE";
438	break;
439	case Node.CDATA_SECTION_NODE:
440	type = "CDATA_SECTION_NODE";
441	break;
442	case Node.COMMENT_NODE:
443	type = "COMMENT_NODE";
444	break;
445	case Node.DOCUMENT_FRAGMENT_NODE:
446	type = "DOCUMENT_FRAGMENT_NODE";
447	break;
448	case Node.DOCUMENT_NODE:
449	type = "DOCUMENT_NODE";
450	break;
451	case Node.DOCUMENT_TYPE_NODE:
452	type = "DOCUMENT_TYPE_NODE";
453	break;
454	case Node.ELEMENT_NODE:
455	type = "ELEMENT_NODE";
456	break;
457	case Node.ENTITY_NODE:
458	type = "ENTITY_NODE";
459	break;
460	case Node.ENTITY_REFERENCE_NODE:
461	type = "ENTITY_REFERENCE_NODE";
462	break;
463	case Node.NOTATION_NODE:
464	type = "NOTATION_NODE";
465	break;
466	case Node.PROCESSING_INSTRUCTION_NODE:
467	type = "PROCESSING_INSTRUCTION_NODE";
468	break;
469	case Node.TEXT_NODE:
470	type = "TEXT_NODE";
471	break;
472	default:
473	type = "UNKNOWN";
474	}
475
476	return type;
477	}
478
479	// returns null if no error occurred during parsing, or else returns the error message
480
481	// public String getParseErrorMessage()
482	// {
483	// ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
484	// return errorHandler.getErrorMessage();
485	// }
486
487	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
488	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
489	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
490	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
491	static public class ParseErrorHandler implements ErrorHandler
492	{
493	protected String errorMessage = null;
494
495	// Receive notification of a recoverable error.
496	public void error(SAXParseException exception)
497	{
498	handleError("Error:\n", exception);
499	}
500
501	// Receive notification of a non-recoverable error.
502	public void fatalError(SAXParseException exception)
503	{
504	handleError("Fatal Error:\n", exception);
505	}
506
507	// Receive notification of a warning.
508	public void warning(SAXParseException exception)
509	{
510	handleError("Warning:\n", exception);
511	}
512
513	public String toString(SAXParseException e)
514	{
515	String msg = e.getMessage();
516	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
517	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
518	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
519
520	return msg;
521	}
522
523	// clears the errorPage variable after first call to this method
524	public String getErrorMessage()
525	{
526	String errMsg = this.errorMessage;
527	if (this.errorMessage != null)
528	{
529	this.errorMessage = null;
530	}
531	return errMsg;
532	}
533
534	// sets the errorMessage member variable to the data stored in the exception
535	// and writes the errorMessage to the logger and tomcat's System.err
536	protected void handleError(String errorType, SAXParseException exception)
537	{
538	this.errorMessage = errorType + toString(exception);
539	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
540	logger.error(this.errorMessage);
541	}
542	}
543
544	public static boolean writeDOM(Element elem, File file) {
545
546	BufferedWriter writer = null;
547	boolean success = false;
548	try {
549	String xml_string = getString(elem);
550	// need createNewFile???
551	writer = new BufferedWriter(new FileWriter(file));
552	writer.write(xml_string);
553	success = true;
554	}
555
556	catch (Exception e) {
557	logger.error(e.getMessage());
558	success = false;
559	}
560	finally {
561	try {
562	if (writer != null) {
563	writer.close();
564	}
565	} catch(Exception e) {
566	logger.error("couldn't close the file"+e.getMessage());
567	}
568	}
569	return success;
570	}
571	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: