Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 29728

Last change on this file since 29728 was 29728, checked in by ak19, 9 years ago
Adjusting gs3-server code to work with changes made to 29686 where web.xml was split into web.xml and servlets.xml, with the first including the second.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.7 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.xml.sax.SAXNotRecognizedException;
34	import org.xml.sax.SAXNotSupportedException;
35	import org.apache.xerces.parsers.DOMParser;
36	import org.apache.xerces.dom.DocumentImpl; // for new Documents
37	import org.apache.xerces.dom.DocumentTypeImpl;
38
39	// other java classes
40	import java.io.BufferedWriter;
41	import java.io.ByteArrayInputStream;
42	import java.io.FileWriter;
43	import java.io.InputStream;
44	import java.io.Reader;
45	import java.io.InputStreamReader;
46	import java.io.StringReader;
47	import java.io.File;
48	import java.io.FileInputStream;
49	import java.io.FileReader;
50	import java.util.regex.*;
51
52	import org.apache.log4j.*;
53
54	// Apache Commons
55	import org.apache.commons.lang3.*;
56
57	import java.util.*;
58	import java.lang.reflect.*;
59
60	/**
61	* XMLConverter - utility class for greenstone
62	*
63	* generates new Documents
64	* parses XML Strings into Documents, converts Nodes to Strings
65	* different parsers have different behaviour - can experiment in here
66	* at the moment we only use xerces
67	* all xerces specific code is in here
68	*/
69	public class XMLConverter
70	{
71
72	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74	/** the no-args constructor */
75	public XMLConverter()
76	{
77
78	}
79
80	/** returns a DOM Document */
81	public static Document getDOM(String in)
82	{
83
84	try
85	{
86	Reader reader = new StringReader(in);
87	InputSource xml_source = new InputSource(reader);
88	Document doc = getDOM(xml_source, null);
89	reader.close();
90	return doc;
91
92	}
93	catch (Exception e)
94	{
95	logger.error(e.getMessage());
96	}
97	return null;
98	}
99
100	/** returns a DOM Document */
101	public static Document getDOM(String in, String encoding)
102	{
103	try
104	{
105	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
106	InputSource xml_source = new InputSource(reader);
107	Document doc = getDOM(xml_source, null);
108	reader.close();
109	return doc;
110
111	}
112	catch (Exception e)
113	{
114	logger.error(e.getMessage());
115	}
116	return null;
117	}
118
119	/** returns a DOM Document */
120	public static Document getDOM(File in) {
121	try
122	{
123	FileReader reader = new FileReader(in);
124	InputSource xml_source = new InputSource(reader);
125	Document doc = getDOM(xml_source, null);
126	reader.close();
127	return doc;
128
129	}
130	catch (Exception e)
131	{
132	logger.error(e.getMessage(), e);
133
134	}
135	return null;
136	}
137
138	public static Document getDOM(File in, String encoding) {
139	return getDOM(in, encoding, null);
140	}
141
142	/** returns a DOM document */
143	public static Document getDOM(File in, String encoding, EntityResolver er) {
144
145	try {
146
147
148	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
149	InputSource xml_source = new InputSource(isr);
150	Document doc = getDOM(xml_source, er);
151	isr.close();
152	return doc;
153
154	}
155	catch (Exception e)
156	{
157	logger.error(e.getMessage());
158	}
159	return null;
160	}
161
162	public static Document getDOM(File in, EntityResolver er) {
163
164	try {
165	InputSource xml_source = new InputSource(new FileInputStream(in));
166	Document doc = getDOM(xml_source, er);
167	return doc;
168	}
169	catch (Exception e)
170	{
171	logger.error(e.getMessage());
172	}
173	return null;
174	}
175
176	public static Document getDOM(InputSource source, EntityResolver er) {
177
178	try {
179	DOMParser parser = new DOMParser();
180	parser.setFeature("http://xml.org/sax/features/validation", false);
181	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
182	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
183	// a performance test showed that having this on lead to increased
184	// memory use for small-medium docs, and not much gain for large
185	// docs.
186	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
187	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
188	// add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
189	parser.setErrorHandler(new ParseErrorHandler());
190	if (er != null) {
191	parser.setEntityResolver(er);
192	}
193	parser.parse(source);
194
195	Document doc = parser.getDocument();
196	return doc;
197
198	} catch (Exception e) {
199
200	logger.error(e.getMessage());
201	}
202	return null;
203
204	}
205
206	/** creates a new empty DOM Document */
207	public static Document newDOM()
208	{
209	Document doc = new DocumentImpl();
210	return doc;
211	}
212
213	/**
214	* This method's parameters represent the parts of the Doctype of this
215	* Document that is to be created. For more info see
216	* http://xerces.apache.org
217	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
218	* .html#DocumentTypeImpl
219	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
220	*
221	* */
222	public static Document newDOM(String qualifiedName, String publicID, String systemID)
223	{
224	// create empty DOM document
225	DocumentImpl docImpl = new DocumentImpl();
226
227	// Need to use the document to create the docType for it
228	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
229
230	// Although we have created the docType using the document, we need to still
231	// put it into the empty document we just created
232	try
233	{
234	docImpl.appendChild(myDocType);
235	}
236	catch (Exception e)
237	{
238	System.out.println("Could not append docType because: " + e);
239	}
240
241	// return the document containing a DocType
242	return docImpl;
243	}
244
245	/** returns the Node as a String */
246	public static String getString(Node xmlNode)
247	{
248	StringBuffer xmlRepresentation = new StringBuffer();
249	getString(xmlNode, xmlRepresentation, 0, false);
250	return xmlRepresentation.toString();
251	}
252
253	/**
254	* returns the node as a nicely formatted String - this introduces extra
255	* text nodes if the String is read back in as a DOM, so should only be used
256	* for printing
257	*/
258	public static String getPrettyString(Node xmlNode)
259	{
260	StringBuffer xmlRepresentation = new StringBuffer();
261	getString(xmlNode, xmlRepresentation, 0, true);
262	return xmlRepresentation.toString();
263	}
264
265	/*
266	* For the purposes of logger.debug statements, where this is called and
267	* hence outputted, returns an empty string if debugging is not enabled
268	*/
269	public static String getPrettyStringLogger(Node xmlNode, Logger log)
270	{
271
272	if (log.isDebugEnabled())
273	return getPrettyString(xmlNode);
274
275	return "";
276
277	}
278
279	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
280	{
281
282	if (xmlNode == null)
283	{
284	xmlRepresentation.append("<null>");
285	return;
286	}
287
288	short nodeType = xmlNode.getNodeType();
289	String nodeName = xmlNode.getNodeName();
290
291	if (nodeType == Node.DOCUMENT_NODE)
292	{
293	Document xmlDocNode = (Document) xmlNode;
294
295	//if (xmlDocNode.getDoctype() == null) {
296	//System.err.println("Doctype is null.");
297	//}
298	//else {
299	if (xmlDocNode.getDoctype() != null)
300	{
301	DocumentType dt = xmlDocNode.getDoctype();
302
303	String name = dt.getName();
304	String pid = dt.getPublicId();
305	String sid = dt.getSystemId();
306
307	// Use previously assigned name, not dt.getName() again
308	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
309
310	xmlRepresentation.append(doctype_str);
311	}
312	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
313	return;
314	}
315	// Handle Element nodes
316	if (nodeType == Node.ELEMENT_NODE)
317	{
318	if (pretty)
319	{
320	xmlRepresentation.append("\n");
321	for (int i = 0; i < depth; i++)
322	{
323	xmlRepresentation.append(" ");
324	}
325	}
326
327	// Write opening tag
328	xmlRepresentation.append("<");
329	xmlRepresentation.append(nodeName);
330
331	// Write the node attributes
332	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
333	for (int i = 0; i < nodeAttributes.getLength(); i++)
334	{
335	Node attribute = nodeAttributes.item(i);
336	xmlRepresentation.append(" ");
337	xmlRepresentation.append(attribute.getNodeName());
338	xmlRepresentation.append("=\"");
339	xmlRepresentation.append(attribute.getNodeValue());
340	xmlRepresentation.append("\"");
341	}
342
343	// If the node has no children, close the opening tag and return
344	if (xmlNode.hasChildNodes() == false)
345	{
346	// This produces somewhat ugly output, but it is necessary to compensate
347	// for display bugs in Netscape. Firstly, the space is needed before the
348	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
349	// example). Also, a newline character would be expected after the tag,
350	// but this causes problems with the display of links (the link text
351	// will contain a newline character, which is displayed badly).
352	xmlRepresentation.append(" />");
353	return;
354	}
355
356	// Close the opening tag
357	xmlRepresentation.append(">");
358
359	// Process the children. We process text nodes here, but recursively process other nodes.
360	// hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
361	// Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
362	// also if these are inside a pre tag then the space shows up in the page.
363
364	NodeList children = xmlNode.getChildNodes();
365	boolean do_pretty = pretty;
366	boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
367	for (int i = 0; i < children.getLength(); i++)
368	{
369	Node child = children.item(i);
370	short child_type = child.getNodeType();
371	if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
372	if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
373	output_escaping = false;
374	}
375	else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
376	output_escaping = true;
377	}
378	else {
379	logger.warn("Unhandled processing instruction " + child.getNodeName());
380	}
381	}
382	else if (child_type == Node.TEXT_NODE) {
383	do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
384	// output the text
385	String text = child.getNodeValue();
386
387	// Perform output escaping, if required
388	// Apache Commons replace method is far superior to String.replaceAll - very fast!
389	if (output_escaping) {
390	text = StringUtils.replace(text, "&", "&");
391	text = StringUtils.replace(text, "<", "<");
392	text = StringUtils.replace(text, ">", ">");
393	text = StringUtils.replace(text, "'", "'");
394	text = StringUtils.replace(text, "\"", """);
395	}
396	// Remove any control-C characters
397	text = StringUtils.replace(text, "" + (char) 3, "");
398
399	xmlRepresentation.append(text);
400
401	}
402	else {
403	// recursively call getString
404	getString(child, xmlRepresentation, depth + 1, do_pretty);
405	}
406	} // foreach child of the element
407
408	// Write closing tag
409	if (pretty)
410	{
411	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
412	{
413	for (int i = 0; i < depth; i++)
414	xmlRepresentation.append(" ");
415	}
416	}
417	xmlRepresentation.append("</");
418	xmlRepresentation.append(nodeName);
419	xmlRepresentation.append(">");
420	if (pretty)
421	{
422	xmlRepresentation.append("\n");
423	}
424	} // ELEMENT_NODE
425
426	else if (nodeType == Node.COMMENT_NODE)
427	{
428	String text = xmlNode.getNodeValue();
429	xmlRepresentation.append("<!-- ");
430	xmlRepresentation.append(text);
431	xmlRepresentation.append(" -->");
432	}
433
434	// TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
435	// A type of node that is not handled yet
436	else
437	{
438	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
439	}
440
441	return;
442	}
443
444	protected static String getNodeTypeString(short node_type)
445	{
446
447	String type = "";
448	switch (node_type)
449	{
450	case Node.ATTRIBUTE_NODE:
451	type = "ATTRIBUTE_NODE";
452	break;
453	case Node.CDATA_SECTION_NODE:
454	type = "CDATA_SECTION_NODE";
455	break;
456	case Node.COMMENT_NODE:
457	type = "COMMENT_NODE";
458	break;
459	case Node.DOCUMENT_FRAGMENT_NODE:
460	type = "DOCUMENT_FRAGMENT_NODE";
461	break;
462	case Node.DOCUMENT_NODE:
463	type = "DOCUMENT_NODE";
464	break;
465	case Node.DOCUMENT_TYPE_NODE:
466	type = "DOCUMENT_TYPE_NODE";
467	break;
468	case Node.ELEMENT_NODE:
469	type = "ELEMENT_NODE";
470	break;
471	case Node.ENTITY_NODE:
472	type = "ENTITY_NODE";
473	break;
474	case Node.ENTITY_REFERENCE_NODE:
475	type = "ENTITY_REFERENCE_NODE";
476	break;
477	case Node.NOTATION_NODE:
478	type = "NOTATION_NODE";
479	break;
480	case Node.PROCESSING_INSTRUCTION_NODE:
481	type = "PROCESSING_INSTRUCTION_NODE";
482	break;
483	case Node.TEXT_NODE:
484	type = "TEXT_NODE";
485	break;
486	default:
487	type = "UNKNOWN";
488	}
489
490	return type;
491	}
492
493	// returns null if no error occurred during parsing, or else returns the error message
494
495	// public String getParseErrorMessage()
496	// {
497	// ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
498	// return errorHandler.getErrorMessage();
499	// }
500
501	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
502	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
503	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
504	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
505	static public class ParseErrorHandler implements ErrorHandler
506	{
507	protected String errorMessage = null;
508
509	// Receive notification of a recoverable error.
510	public void error(SAXParseException exception)
511	{
512	handleError("Error:\n", exception);
513	}
514
515	// Receive notification of a non-recoverable error.
516	public void fatalError(SAXParseException exception)
517	{
518	handleError("Fatal Error:\n", exception);
519	}
520
521	// Receive notification of a warning.
522	public void warning(SAXParseException exception)
523	{
524	handleError("Warning:\n", exception);
525	}
526
527	public String toString(SAXParseException e)
528	{
529	String msg = e.getMessage();
530	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
531	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
532	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
533
534	return msg;
535	}
536
537	// clears the errorPage variable after first call to this method
538	public String getErrorMessage()
539	{
540	String errMsg = this.errorMessage;
541	if (this.errorMessage != null)
542	{
543	this.errorMessage = null;
544	}
545	return errMsg;
546	}
547
548	// sets the errorMessage member variable to the data stored in the exception
549	// and writes the errorMessage to the logger and tomcat's System.err
550	protected void handleError(String errorType, SAXParseException exception)
551	{
552	this.errorMessage = errorType + toString(exception);
553	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
554	logger.error(this.errorMessage);
555	}
556	}
557
558	public static boolean writeDOM(Element elem, File file) {
559
560	BufferedWriter writer = null;
561	boolean success = false;
562	try {
563	String xml_string = getString(elem);
564	// need createNewFile???
565	writer = new BufferedWriter(new FileWriter(file));
566	writer.write(xml_string);
567	success = true;
568	}
569
570	catch (Exception e) {
571	logger.error(e.getMessage());
572	success = false;
573	}
574	finally {
575	try {
576	if (writer != null) {
577	writer.close();
578	}
579	} catch(Exception e) {
580	logger.error("couldn't close the file"+e.getMessage());
581	}
582	}
583	return success;
584	}
585	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: