Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 33897

Last change on this file since 33897 was 33897, checked in by kjdon, 4 years ago
Elsewhere in the code (GSXML.xmlSafe) we are escaping ' => &apos;, so we need to make sure we recover this here too; otherwise we get &amp;apos; in the interface, e.g. in the search box if the search term contains an apostrophe.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.8 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.xml.sax.SAXNotRecognizedException;
34	import org.xml.sax.SAXNotSupportedException;
35	import org.apache.xerces.parsers.DOMParser;
36	import org.apache.xerces.dom.DocumentImpl; // for new Documents
37	import org.apache.xerces.dom.DocumentTypeImpl;
38
39	// other java classes
40	import java.io.BufferedWriter;
41	import java.io.ByteArrayInputStream;
42	import java.io.FileWriter;
43	import java.io.InputStream;
44	import java.io.Reader;
45	import java.io.InputStreamReader;
46	import java.io.StringReader;
47	import java.io.File;
48	import java.io.FileInputStream;
49	import java.io.FileReader;
50	import java.util.regex.*;
51
52	import org.apache.log4j.*;
53
54	// Apache Commons
55	import org.apache.commons.lang3.*;
56
57	import java.util.*;
58	import java.lang.reflect.*;
59
60	/**
61	* XMLConverter - utility class for greenstone
62	*
63	* generates new Documents
64	* parses XML Strings into Documents, converts Nodes to Strings
65	* different parsers have different behaviour - can experiment in here
66	* at the moment we only use xerces
67	* all xerces specific code is in here
68	*/
69	public class XMLConverter
70	{
71
72	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
73
74	/** the no-args constructor */
75	public XMLConverter()
76	{
77
78	}
79
80	/** returns a DOM Document */
81	public static Document getDOM(String in)
82	{
83
84	try
85	{
86	Reader reader = new StringReader(in);
87	InputSource xml_source = new InputSource(reader);
88	Document doc = getDOM(xml_source, null);
89	reader.close();
90	return doc;
91
92	}
93	catch (Exception e)
94	{
95	logger.error(e.getMessage());
96	logger.error("Input string was:\n" + in);
97	e.printStackTrace();
98	}
99	return null;
100	}
101
102	/** returns a DOM Document */
103	public static Document getDOM(String in, String encoding)
104	{
105	try
106	{
107	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
108	InputSource xml_source = new InputSource(reader);
109	Document doc = getDOM(xml_source, null);
110	reader.close();
111	return doc;
112
113	}
114	catch (Exception e)
115	{
116	logger.error(e.getMessage());
117	logger.error("Input string was:\n" + in);
118	e.printStackTrace();
119	}
120	return null;
121	}
122
123	/** returns a DOM Document */
124	public static Document getDOM(File in) {
125	try
126	{
127	FileReader reader = new FileReader(in);
128	InputSource xml_source = new InputSource(reader);
129	Document doc = getDOM(xml_source, null);
130	reader.close();
131	return doc;
132
133	}
134	catch (Exception e)
135	{
136	logger.error(e.getMessage(), e);
137	logger.error("File was:\n" + in.getPath());
138	e.printStackTrace();
139
140	}
141	return null;
142	}
143
144	public static Document getDOM(File in, String encoding) {
145	return getDOM(in, encoding, null);
146	}
147
148	/** returns a DOM document */
149	public static Document getDOM(File in, String encoding, EntityResolver er) {
150
151	try {
152
153
154	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
155	InputSource xml_source = new InputSource(isr);
156	Document doc = getDOM(xml_source, er);
157	isr.close();
158	return doc;
159
160	}
161	catch (Exception e)
162	{
163	logger.error(e.getMessage());
164	logger.error("File was:\n" + in.getPath());
165	e.printStackTrace();
166	}
167	return null;
168	}
169
170	public static Document getDOM(File in, EntityResolver er) {
171
172	try {
173	InputSource xml_source = new InputSource(new FileInputStream(in));
174	Document doc = getDOM(xml_source, er);
175	return doc;
176	}
177	catch (Exception e)
178	{
179	logger.error(e.getMessage());
180	logger.error("File was:\n" + in.getPath());
181	e.printStackTrace();
182	}
183	return null;
184	}
185
186	public static Document getDOM(InputSource source, EntityResolver er) {
187
188	try {
189	DOMParser parser = new DOMParser();
190	parser.setFeature("http://xml.org/sax/features/validation", false);
191	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
192	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
193	// a performance test showed that having this on lead to increased
194	// memory use for small-medium docs, and not much gain for large
195	// docs.
196	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
197	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
198	// add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
199	parser.setErrorHandler(new ParseErrorHandler());
200	if (er != null) {
201	parser.setEntityResolver(er);
202	}
203	parser.parse(source);
204
205	Document doc = parser.getDocument();
206	return doc;
207
208	} catch (Exception e) {
209
210	logger.error(e.getMessage());
211	logger.error("InputSource:\n");
212	e.printStackTrace();
213	}
214	return null;
215
216	}
217
218	/** creates a new empty DOM Document */
219	public static Document newDOM()
220	{
221	Document doc = new DocumentImpl();
222	return doc;
223	}
224
225	/**
226	* This method's parameters represent the parts of the Doctype of this
227	* Document that is to be created. For more info see
228	* http://xerces.apache.org
229	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
230	* .html#DocumentTypeImpl
231	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
232	*
233	* */
234	public static Document newDOM(String qualifiedName, String publicID, String systemID)
235	{
236	// create empty DOM document
237	DocumentImpl docImpl = new DocumentImpl();
238
239	// Need to use the document to create the docType for it
240	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
241
242	// Although we have created the docType using the document, we need to still
243	// put it into the empty document we just created
244	try
245	{
246	docImpl.appendChild(myDocType);
247	}
248	catch (Exception e)
249	{
250	System.out.println("Could not append docType because: " + e);
251	}
252
253	// return the document containing a DocType
254	return docImpl;
255	}
256
257	/** returns the Node as a String */
258	public static String getString(Node xmlNode)
259	{
260	StringBuffer xmlRepresentation = new StringBuffer();
261	getString(xmlNode, xmlRepresentation, 0, false);
262	return xmlRepresentation.toString();
263	}
264
265	/**
266	* returns the node as a nicely formatted String - this introduces extra
267	* text nodes if the String is read back in as a DOM, so should only be used
268	* for printing
269	*/
270	public static String getPrettyString(Node xmlNode)
271	{
272	StringBuffer xmlRepresentation = new StringBuffer();
273	getString(xmlNode, xmlRepresentation, 0, true);
274	return xmlRepresentation.toString();
275	}
276
277	/*
278	* For the purposes of logger.debug statements, where this is called and
279	* hence outputted, returns an empty string if debugging is not enabled
280	*/
281	public static String getPrettyStringLogger(Node xmlNode, Logger log)
282	{
283
284	if (log.isDebugEnabled())
285	return getPrettyString(xmlNode);
286
287	return "";
288
289	}
290
291	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
292	{
293
294	if (xmlNode == null)
295	{
296	xmlRepresentation.append("<null>");
297	return;
298	}
299
300	short nodeType = xmlNode.getNodeType();
301	String nodeName = xmlNode.getNodeName();
302
303	if (nodeType == Node.DOCUMENT_NODE)
304	{
305	Document xmlDocNode = (Document) xmlNode;
306
307	//if (xmlDocNode.getDoctype() == null) {
308	//System.err.println("Doctype is null.");
309	//}
310	//else {
311	if (xmlDocNode.getDoctype() != null)
312	{
313	DocumentType dt = xmlDocNode.getDoctype();
314
315	String name = dt.getName();
316	String pid = dt.getPublicId();
317	String sid = dt.getSystemId();
318
319	// Use previously assigned name, not dt.getName() again
320	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
321
322	xmlRepresentation.append(doctype_str);
323	}
324	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
325	return;
326	}
327	// Handle Element nodes
328	if (nodeType == Node.ELEMENT_NODE)
329	{
330	if (pretty)
331	{
332	xmlRepresentation.append("\n");
333	for (int i = 0; i < depth; i++)
334	{
335	xmlRepresentation.append(" ");
336	}
337	}
338
339	// Write opening tag
340	xmlRepresentation.append("<");
341	xmlRepresentation.append(nodeName);
342
343	// Write the node attributes
344	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
345	for (int i = 0; i < nodeAttributes.getLength(); i++)
346	{
347	Node attribute = nodeAttributes.item(i);
348	xmlRepresentation.append(" ");
349	xmlRepresentation.append(attribute.getNodeName());
350	xmlRepresentation.append("=\"");
351	String attr_val = attribute.getNodeValue();
352
353	attr_val = attr_val.replaceAll("&","&");
354	attr_val = attr_val.replaceAll("<","<");
355	attr_val = attr_val.replaceAll(">",">");
356	attr_val = attr_val.replaceAll("\"",""");
357
358	// assume that any of the above chars that was already entity escaped
359	// was already correct => return back to how they were
360	attr_val = attr_val.replaceAll("&amp;","&");
361	attr_val = attr_val.replaceAll("&lt;","<");
362	attr_val = attr_val.replaceAll("&gt;",">");
363	attr_val = attr_val.replaceAll("&quot;",""");
364	attr_val = attr_val.replaceAll("&apos;","'");
365
366
367	xmlRepresentation.append(attr_val);
368	xmlRepresentation.append("\"");
369	}
370
371	// If the node has no children, close the opening tag and return
372	if (xmlNode.hasChildNodes() == false)
373	{
374	// This produces somewhat ugly output, but it is necessary to compensate
375	// for display bugs in Netscape. Firstly, the space is needed before the
376	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
377	// example). Also, a newline character would be expected after the tag,
378	// but this causes problems with the display of links (the link text
379	// will contain a newline character, which is displayed badly).
380	xmlRepresentation.append(" />");
381	return;
382	}
383
384	// Close the opening tag
385	xmlRepresentation.append(">");
386
387	// Process the children. We process text nodes here, but recursively process other nodes.
388	// hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
389	// Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
390	// also if these are inside a pre tag then the space shows up in the page.
391
392	NodeList children = xmlNode.getChildNodes();
393	boolean do_pretty = pretty;
394	boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
395	for (int i = 0; i < children.getLength(); i++)
396	{
397	Node child = children.item(i);
398	short child_type = child.getNodeType();
399	if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
400	if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
401	output_escaping = false;
402	}
403	else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
404	output_escaping = true;
405	}
406	else {
407	logger.warn("Unhandled processing instruction " + child.getNodeName());
408	}
409	}
410	else if (child_type == Node.TEXT_NODE) {
411	do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
412	// output the text
413	String text = child.getNodeValue();
414
415	// Perform output escaping, if required
416	// Apache Commons replace method is far superior to String.replaceAll - very fast!
417	if (output_escaping) {
418	text = StringUtils.replace(text, "&", "&");
419	text = StringUtils.replace(text, "<", "<");
420	text = StringUtils.replace(text, ">", ">");
421	text = StringUtils.replace(text, "'", "'");
422	text = StringUtils.replace(text, "\"", """);
423	}
424	// Remove any control-C characters
425	text = StringUtils.replace(text, "" + (char) 3, "");
426
427	xmlRepresentation.append(text);
428
429	}
430	else {
431	// recursively call getString
432	getString(child, xmlRepresentation, depth + 1, do_pretty);
433	}
434	} // foreach child of the element
435
436	// Write closing tag
437	if (pretty)
438	{
439	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
440	{
441	for (int i = 0; i < depth; i++)
442	xmlRepresentation.append(" ");
443	}
444	}
445	xmlRepresentation.append("</");
446	xmlRepresentation.append(nodeName);
447	xmlRepresentation.append(">");
448	if (pretty)
449	{
450	xmlRepresentation.append("\n");
451	}
452	} // ELEMENT_NODE
453
454	else if (nodeType == Node.COMMENT_NODE)
455	{
456	String text = xmlNode.getNodeValue();
457	xmlRepresentation.append("<!-- ");
458	xmlRepresentation.append(text);
459	xmlRepresentation.append(" -->");
460	}
461
462	// TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
463	// A type of node that is not handled yet
464	else
465	{
466	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
467	}
468
469	return;
470	}
471
472	protected static String getNodeTypeString(short node_type)
473	{
474
475	String type = "";
476	switch (node_type)
477	{
478	case Node.ATTRIBUTE_NODE:
479	type = "ATTRIBUTE_NODE";
480	break;
481	case Node.CDATA_SECTION_NODE:
482	type = "CDATA_SECTION_NODE";
483	break;
484	case Node.COMMENT_NODE:
485	type = "COMMENT_NODE";
486	break;
487	case Node.DOCUMENT_FRAGMENT_NODE:
488	type = "DOCUMENT_FRAGMENT_NODE";
489	break;
490	case Node.DOCUMENT_NODE:
491	type = "DOCUMENT_NODE";
492	break;
493	case Node.DOCUMENT_TYPE_NODE:
494	type = "DOCUMENT_TYPE_NODE";
495	break;
496	case Node.ELEMENT_NODE:
497	type = "ELEMENT_NODE";
498	break;
499	case Node.ENTITY_NODE:
500	type = "ENTITY_NODE";
501	break;
502	case Node.ENTITY_REFERENCE_NODE:
503	type = "ENTITY_REFERENCE_NODE";
504	break;
505	case Node.NOTATION_NODE:
506	type = "NOTATION_NODE";
507	break;
508	case Node.PROCESSING_INSTRUCTION_NODE:
509	type = "PROCESSING_INSTRUCTION_NODE";
510	break;
511	case Node.TEXT_NODE:
512	type = "TEXT_NODE";
513	break;
514	default:
515	type = "UNKNOWN";
516	}
517
518	return type;
519	}
520
521	// returns null if there no error occurred during parsing, or else returns the error message
522
523	// public String getParseErrorMessage()
524	// {
525	// ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
526	// return errorHandler.getErrorMessage();
527	// }
528
529	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
530	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
531	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
532	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
533	static public class ParseErrorHandler implements ErrorHandler
534	{
535	protected String errorMessage = null;
536
537	// Receive notification of a recoverable error.
538	public void error(SAXParseException exception)
539	{
540	handleError("Error:\n", exception);
541	}
542
543	// Receive notification of a non-recoverable error.
544	public void fatalError(SAXParseException exception)
545	{
546	handleError("Fatal Error:\n", exception);
547	}
548
549	// Receive notification of a warning.
550	public void warning(SAXParseException exception)
551	{
552	handleError("Warning:\n", exception);
553	}
554
555	public String toString(SAXParseException e)
556	{
557	String msg = e.getMessage();
558	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
559	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
560	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
561
562	return msg;
563	}
564
565	// clears the errorPage variable after first call to this method
566	public String getErrorMessage()
567	{
568	String errMsg = this.errorMessage;
569	if (this.errorMessage != null)
570	{
571	this.errorMessage = null;
572	}
573	return errMsg;
574	}
575
576	// sets the errorMessage member variable to the data stored in the exception
577	// and writes the errorMessage to the logger and tomcat's System.err
578	protected void handleError(String errorType, SAXParseException exception)
579	{
580	this.errorMessage = errorType + toString(exception);
581	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
582	logger.error(this.errorMessage);
583	}
584	}
585
586	public static boolean writeDOM(Element elem, File file) {
587
588	BufferedWriter writer = null;
589	boolean success = false;
590	try {
591	String xml_string = getString(elem);
592	// need createNewFile???
593	writer = new BufferedWriter(new FileWriter(file));
594	writer.write(xml_string);
595	success = true;
596	}
597
598	catch (Exception e) {
599	logger.error(e.getMessage());
600	success = false;
601	}
602	finally {
603	try {
604	if (writer != null) {
605	writer.close();
606	}
607	} catch(Exception e) {
608	logger.error("couldn't close the file"+e.getMessage());
609	}
610	}
611	return success;
612	}
613	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: