Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 34112

Last change on this file since 34112 was 34112, checked in by ak19, 4 years ago
GS3 source code seems to already use FileInputStream with UTF-8 encoding where XML files are concerned. Refactored XML_FILE_ENCODING into GSConstants class so both XMLConverter and now XMLTransformer can use it.
Property svn:keywords set to `Author Date Id Revision`
File size: 19.8 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// import constant GSConstants.XML_FILE_ENCODING, so this file can use it as XML_FILE_ENCODING directly
22	//import static org.greenstone.gsdl3.util.GSConstants.XML_FILE_ENCODING;
23
24	// XML classes
25	import org.w3c.dom.DOMImplementation;
26	import org.w3c.dom.Document;
27	import org.w3c.dom.DocumentType;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.NodeList;
31	import org.w3c.dom.NamedNodeMap;
32	import org.xml.sax.InputSource;
33	import org.xml.sax.EntityResolver;
34	import org.xml.sax.ErrorHandler;
35	import org.xml.sax.SAXParseException;
36	import org.xml.sax.SAXNotRecognizedException;
37	import org.xml.sax.SAXNotSupportedException;
38	import org.apache.xerces.parsers.DOMParser;
39	import org.apache.xerces.dom.DocumentImpl; // for new Documents
40	import org.apache.xerces.dom.DocumentTypeImpl;
41
42	// other java classes
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.*;
54
55	import org.apache.log4j.*;
56
57	// Apache Commons
58	import org.apache.commons.lang3.*;
59
60	import java.util.*;
61	import java.lang.reflect.*;
62
63	/**
64	* XMLConverter - utility class for greenstone
65	*
66	* generates new Documents
67	* parses XML Strings into Documents, converts Nodes to Strings
68	* different parsers have different behaviour - can experiment in here
69	* at the moment we only use xerces
70	* all xerces specific code is in here
71	*/
72	public class XMLConverter
73	{
74
75	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
76
77
78
79
80	static {
81	String fileEncodingProperty = System.getProperty("file.encoding");
82
83	// log a warning if the file.encoding Java property departs from GSConstants.XML_FILE_ENCODING,
84	// but always use the latter for reading in XML files in getDOM(File, ...) methods
85
86	// Stopped passing around JAVA_TOOL_OPTIONS="-Dfile.encoding=UTF-8" again.
87	// So this is likely to be null by default. Don't want to see the warnings about it being null.
88	/*if(fileEncodingProperty == null) {
89	logger.warn("file.encoding Java property was not set.");
90	logger.warn("Using XMLConverter.GSConstants.XML_FILE_ENCODING="+GSConstants.XML_FILE_ENCODING+"in XMLConverter.getDOM() calls.");
91	}
92	else if(!fileEncodingProperty.equals(GSConstants.XML_FILE_ENCODING)) {
93	*/
94	if(fileEncodingProperty != null && !fileEncodingProperty.equals(GSConstants.XML_FILE_ENCODING)) {
95	logger.warn("file.encoding property was passed in with JAVA_TOOLS_OPTIONS as: "
96	+ fileEncodingProperty);
97	logger.warn("This does not match XMLConverter.GSConstants.XML_FILE_ENCODING: " + GSConstants.XML_FILE_ENCODING);
98	logger.warn("Using XMLConverter.GSConstants.XML_FILE_ENCODING in XMLConverter.getDOM() and XMLTransformer calls.");
99	}
100	};
101
102
103	/** the no-args constructor */
104	public XMLConverter()
105	{
106
107	}
108
109	/** returns a DOM Document
110	* Question: why is this not simply calling the getDOM(String in, String encoding) version?
111	* Answer: Because the string, being a Java String, is already fully formed in unicode,
112	* so it doesn't need to call that other version.
113	*/
114	public static Document getDOM(String in)
115	{
116
117	try
118	{
119	Reader reader = new StringReader(in);
120	InputSource xml_source = new InputSource(reader);
121	Document doc = getDOM(xml_source, null);
122	reader.close();
123	return doc;
124
125	}
126	catch (Exception e)
127	{
128	logger.error(e.getMessage());
129	logger.error("Input string was:\n" + in);
130	e.printStackTrace();
131	}
132	return null;
133	}
134
135	/** returns a DOM Document */
136	public static Document getDOM(String in, String encoding)
137	{
138	try
139	{
140	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
141	InputSource xml_source = new InputSource(reader);
142	Document doc = getDOM(xml_source, null);
143	reader.close();
144	return doc;
145
146	}
147	catch (Exception e)
148	{
149	logger.error(e.getMessage());
150	logger.error("Input string was:\n" + in);
151	e.printStackTrace();
152	}
153	return null;
154	}
155
156	/** returns a DOM Document */
157	public static Document getDOM(File in) {
158
159	// now we're always going to explicitly use GSConstants.XML_FILE_ENCODING (UTF-8)
160	// as the encoding to read in a file unless otherwise specified.
161	return getDOM(in, GSConstants.XML_FILE_ENCODING, null);
162
163	/*
164	try
165	{
166	FileReader reader = new FileReader(in);
167	InputSource xml_source = new InputSource(reader);
168	Document doc = getDOM(xml_source, null);
169	reader.close();
170	return doc;
171
172	}
173	catch (Exception e)
174	{
175	logger.error(e.getMessage(), e);
176	logger.error("File was:\n" + in.getPath());
177	e.printStackTrace();
178
179	}
180	return null;
181	*/
182	}
183
184	public static Document getDOM(File in, String encoding) {
185	return getDOM(in, encoding, null);
186	}
187
188	/** returns a DOM document */
189	public static Document getDOM(File in, String encoding, EntityResolver er) {
190
191	try {
192
193
194	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
195	InputSource xml_source = new InputSource(isr);
196	Document doc = getDOM(xml_source, er);
197	isr.close();
198	return doc;
199
200	}
201	catch (Exception e)
202	{
203	logger.error(e.getMessage());
204	logger.error("File was:\n" + in.getPath());
205	e.printStackTrace();
206	}
207	return null;
208	}
209
210	public static Document getDOM(File in, EntityResolver er) {
211	// now we're always going to explicitly use GSConstants.XML_FILE_ENCODING (UTF-8)
212	// as the encoding to read in a file unless otherwise specified.
213	return getDOM(in, GSConstants.XML_FILE_ENCODING, er);
214	/*
215	try {
216	InputSource xml_source = new InputSource(new FileInputStream(in));
217	Document doc = getDOM(xml_source, er);
218	return doc;
219	}
220	catch (Exception e)
221	{
222	logger.error(e.getMessage());
223	logger.error("File was:\n" + in.getPath());
224	e.printStackTrace();
225	}
226	return null;
227	*/
228	}
229
230	public static Document getDOM(InputSource source, EntityResolver er) {
231
232	try {
233	DOMParser parser = new DOMParser();
234	parser.setFeature("http://xml.org/sax/features/validation", false);
235	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
236	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
237	// a performance test showed that having this on lead to increased
238	// memory use for small-medium docs, and not much gain for large
239	// docs.
240	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
241	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
242	// add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
243	parser.setErrorHandler(new ParseErrorHandler());
244	if (er != null) {
245	parser.setEntityResolver(er);
246	}
247	parser.parse(source);
248
249	Document doc = parser.getDocument();
250	return doc;
251
252	} catch (Exception e) {
253
254	logger.error(e.getMessage());
255	logger.error("InputSource:\n");
256	e.printStackTrace();
257	}
258	return null;
259
260	}
261
262	/** creates a new empty DOM Document */
263	public static Document newDOM()
264	{
265	Document doc = new DocumentImpl();
266	return doc;
267	}
268
269	/**
270	* This method's parameters represent the parts of the Doctype of this
271	* Document that is to be created. For more info see
272	* http://xerces.apache.org
273	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
274	* .html#DocumentTypeImpl
275	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
276	*
277	* */
278	public static Document newDOM(String qualifiedName, String publicID, String systemID)
279	{
280	// create empty DOM document
281	DocumentImpl docImpl = new DocumentImpl();
282
283	// Need to use the document to create the docType for it
284	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
285
286	// Although we have created the docType using the document, we need to still
287	// put it into the empty document we just created
288	try
289	{
290	docImpl.appendChild(myDocType);
291	}
292	catch (Exception e)
293	{
294	System.out.println("Could not append docType because: " + e);
295	}
296
297	// return the document containing a DocType
298	return docImpl;
299	}
300
301	/** returns the Node as a String */
302	public static String getString(Node xmlNode)
303	{
304	StringBuffer xmlRepresentation = new StringBuffer();
305	getString(xmlNode, xmlRepresentation, 0, false);
306	return xmlRepresentation.toString();
307	}
308
309	/**
310	* returns the node as a nicely formatted String - this introduces extra
311	* text nodes if the String is read back in as a DOM, so should only be used
312	* for printing
313	*/
314	public static String getPrettyString(Node xmlNode)
315	{
316	StringBuffer xmlRepresentation = new StringBuffer();
317	getString(xmlNode, xmlRepresentation, 0, true);
318	return xmlRepresentation.toString();
319	}
320
321	/*
322	* For the purposes of logger.debug statements, where this is called and
323	* hence outputted, returns an empty string if debugging is not enabled
324	*/
325	public static String getPrettyStringLogger(Node xmlNode, Logger log)
326	{
327
328	if (log.isDebugEnabled())
329	return getPrettyString(xmlNode);
330
331	return "";
332
333	}
334
335	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
336	{
337
338	if (xmlNode == null)
339	{
340	xmlRepresentation.append("<null>");
341	return;
342	}
343
344	short nodeType = xmlNode.getNodeType();
345	String nodeName = xmlNode.getNodeName();
346
347	if (nodeType == Node.DOCUMENT_NODE)
348	{
349	Document xmlDocNode = (Document) xmlNode;
350
351	//if (xmlDocNode.getDoctype() == null) {
352	//System.err.println("Doctype is null.");
353	//}
354	//else {
355	if (xmlDocNode.getDoctype() != null)
356	{
357	DocumentType dt = xmlDocNode.getDoctype();
358
359	String name = dt.getName();
360	String pid = dt.getPublicId();
361	String sid = dt.getSystemId();
362
363	// Use previously assigned name, not dt.getName() again
364	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
365
366	xmlRepresentation.append(doctype_str);
367	}
368	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
369	return;
370	}
371	// Handle Element nodes
372	if (nodeType == Node.ELEMENT_NODE)
373	{
374	if (pretty)
375	{
376	xmlRepresentation.append("\n");
377	for (int i = 0; i < depth; i++)
378	{
379	xmlRepresentation.append(" ");
380	}
381	}
382
383	// Write opening tag
384	xmlRepresentation.append("<");
385	xmlRepresentation.append(nodeName);
386
387	// Write the node attributes
388	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
389	for (int i = 0; i < nodeAttributes.getLength(); i++)
390	{
391	Node attribute = nodeAttributes.item(i);
392	xmlRepresentation.append(" ");
393	xmlRepresentation.append(attribute.getNodeName());
394	xmlRepresentation.append("=\"");
395	String attr_val = attribute.getNodeValue();
396
397	attr_val = attr_val.replaceAll("&","&");
398	attr_val = attr_val.replaceAll("<","<");
399	attr_val = attr_val.replaceAll(">",">");
400	attr_val = attr_val.replaceAll("\"",""");
401
402	// assume that any of the above chars that was already entity escaped
403	// was already correct => return back to how they were
404	attr_val = attr_val.replaceAll("&amp;","&");
405	attr_val = attr_val.replaceAll("&lt;","<");
406	attr_val = attr_val.replaceAll("&gt;",">");
407	attr_val = attr_val.replaceAll("&quot;",""");
408	attr_val = attr_val.replaceAll("&apos;","'");
409
410
411	xmlRepresentation.append(attr_val);
412	xmlRepresentation.append("\"");
413	}
414
415	// If the node has no children, close the opening tag and return
416	if (xmlNode.hasChildNodes() == false)
417	{
418	// This produces somewhat ugly output, but it is necessary to compensate
419	// for display bugs in Netscape. Firstly, the space is needed before the
420	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
421	// example). Also, a newline character would be expected after the tag,
422	// but this causes problems with the display of links (the link text
423	// will contain a newline character, which is displayed badly).
424	xmlRepresentation.append(" />");
425	return;
426	}
427
428	// Close the opening tag
429	xmlRepresentation.append(">");
430
431	// Process the children. We process text nodes here, but recursively process other nodes.
432	// hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
433	// Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
434	// also if these are inside a pre tag then the space shows up in the page.
435
436	NodeList children = xmlNode.getChildNodes();
437	boolean do_pretty = pretty;
438	boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
439	for (int i = 0; i < children.getLength(); i++)
440	{
441	Node child = children.item(i);
442	short child_type = child.getNodeType();
443	if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
444	if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
445	output_escaping = false;
446	}
447	else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
448	output_escaping = true;
449	}
450	else {
451	logger.warn("Unhandled processing instruction " + child.getNodeName());
452	}
453	}
454	else if (child_type == Node.TEXT_NODE) {
455	do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
456	// output the text
457	String text = child.getNodeValue();
458
459	// Perform output escaping, if required
460	// Apache Commons replace method is far superior to String.replaceAll - very fast!
461	if (output_escaping) {
462	text = StringUtils.replace(text, "&", "&");
463	text = StringUtils.replace(text, "<", "<");
464	text = StringUtils.replace(text, ">", ">");
465	text = StringUtils.replace(text, "'", "'");
466	text = StringUtils.replace(text, "\"", """);
467	}
468	// Remove any control-C characters
469	text = StringUtils.replace(text, "" + (char) 3, "");
470
471	xmlRepresentation.append(text);
472
473	}
474	else {
475	// recursively call getString
476	getString(child, xmlRepresentation, depth + 1, do_pretty);
477	}
478	} // foreach child of the element
479
480	// Write closing tag
481	if (pretty)
482	{
483	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
484	{
485	for (int i = 0; i < depth; i++)
486	xmlRepresentation.append(" ");
487	}
488	}
489	xmlRepresentation.append("</");
490	xmlRepresentation.append(nodeName);
491	xmlRepresentation.append(">");
492	if (pretty)
493	{
494	xmlRepresentation.append("\n");
495	}
496	} // ELEMENT_NODE
497
498	else if (nodeType == Node.COMMENT_NODE)
499	{
500	String text = xmlNode.getNodeValue();
501	xmlRepresentation.append("<!-- ");
502	xmlRepresentation.append(text);
503	xmlRepresentation.append(" -->");
504	}
505
506	// TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
507	// A type of node that is not handled yet
508	else
509	{
510	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
511	}
512
513	return;
514	}
515
516	protected static String getNodeTypeString(short node_type)
517	{
518
519	String type = "";
520	switch (node_type)
521	{
522	case Node.ATTRIBUTE_NODE:
523	type = "ATTRIBUTE_NODE";
524	break;
525	case Node.CDATA_SECTION_NODE:
526	type = "CDATA_SECTION_NODE";
527	break;
528	case Node.COMMENT_NODE:
529	type = "COMMENT_NODE";
530	break;
531	case Node.DOCUMENT_FRAGMENT_NODE:
532	type = "DOCUMENT_FRAGMENT_NODE";
533	break;
534	case Node.DOCUMENT_NODE:
535	type = "DOCUMENT_NODE";
536	break;
537	case Node.DOCUMENT_TYPE_NODE:
538	type = "DOCUMENT_TYPE_NODE";
539	break;
540	case Node.ELEMENT_NODE:
541	type = "ELEMENT_NODE";
542	break;
543	case Node.ENTITY_NODE:
544	type = "ENTITY_NODE";
545	break;
546	case Node.ENTITY_REFERENCE_NODE:
547	type = "ENTITY_REFERENCE_NODE";
548	break;
549	case Node.NOTATION_NODE:
550	type = "NOTATION_NODE";
551	break;
552	case Node.PROCESSING_INSTRUCTION_NODE:
553	type = "PROCESSING_INSTRUCTION_NODE";
554	break;
555	case Node.TEXT_NODE:
556	type = "TEXT_NODE";
557	break;
558	default:
559	type = "UNKNOWN";
560	}
561
562	return type;
563	}
564
565	// returns null if no error occurred during parsing, or else returns the error message
566
567	// public String getParseErrorMessage()
568	// {
569	// ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
570	// return errorHandler.getErrorMessage();
571	// }
572
573	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
574	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
575	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
576	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
577	static public class ParseErrorHandler implements ErrorHandler
578	{
579	protected String errorMessage = null;
580
581	// Receive notification of a recoverable error.
582	public void error(SAXParseException exception)
583	{
584	handleError("Error:\n", exception);
585	}
586
587	// Receive notification of a non-recoverable error.
588	public void fatalError(SAXParseException exception)
589	{
590	handleError("Fatal Error:\n", exception);
591	}
592
593	// Receive notification of a warning.
594	public void warning(SAXParseException exception)
595	{
596	handleError("Warning:\n", exception);
597	}
598
599	public String toString(SAXParseException e)
600	{
601	String msg = e.getMessage();
602	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
603	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
604	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
605
606	return msg;
607	}
608
609	// clears the errorPage variable after first call to this method
610	public String getErrorMessage()
611	{
612	String errMsg = this.errorMessage;
613	if (this.errorMessage != null)
614	{
615	this.errorMessage = null;
616	}
617	return errMsg;
618	}
619
620	// sets the errorMessage member variable to the data stored in the exception
621	// and writes the errorMessage to the logger and tomcat's System.err
622	protected void handleError(String errorType, SAXParseException exception)
623	{
624	this.errorMessage = errorType + toString(exception);
625	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
626	logger.error(this.errorMessage);
627	}
628	}
629
630	public static boolean writeDOM(Element elem, File file) {
631
632	BufferedWriter writer = null;
633	boolean success = false;
634	try {
635	String xml_string = getString(elem);
636	// need createNewFile???
637	writer = new BufferedWriter(new FileWriter(file));
638	writer.write(xml_string);
639	success = true;
640	}
641
642	catch (Exception e) {
643	logger.error(e.getMessage());
644	success = false;
645	}
646	finally {
647	try {
648	if (writer != null) {
649	writer.close();
650	}
651	} catch(Exception e) {
652	logger.error("couldn't close the file"+e.getMessage());
653	}
654	}
655	return success;
656	}
657	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: