Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 34101

Last change on this file since 34101 was 34101, checked in by ak19, 4 years ago
Correcting syntax error and overlooked another instance of System.err.println() that should have been a logger.warn().
Property svn:keywords set to `Author Date Id Revision`
File size: 19.3 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.xml.sax.SAXNotRecognizedException;
34	import org.xml.sax.SAXNotSupportedException;
35	import org.apache.xerces.parsers.DOMParser;
36	import org.apache.xerces.dom.DocumentImpl; // for new Documents
37	import org.apache.xerces.dom.DocumentTypeImpl;
38
// other java classes
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.*;
51
52	import org.apache.log4j.*;
53
54	// Apache Commons
55	import org.apache.commons.lang3.*;
56
57	import java.util.*;
58	import java.lang.reflect.*;
59
60	/**
61	* XMLConverter - utility class for greenstone
62	*
63	* generates new Documents
64	* parses XML Strings into Documents, converts Nodes to Strings
65	* different parsers have different behaviour - can experiment in here
66	* at the moment we only use xerces
67	* all xerces specific code is in here
68	*/
69	public class XMLConverter
70	{
71
/** Logger used by the static helpers of this class. */
static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());


/** Encoding used to read XML files in the getDOM(File, ...) methods. */
public static final String XML_FILE_ENCODING = "UTF-8";

static {
    String fileEncodingProperty = System.getProperty("file.encoding");

    // log a warning if the file.encoding Java property departs from XML_FILE_ENCODING,
    // but always use the latter for reading in XML files in getDOM(File, ...) methods
    if (fileEncodingProperty == null) {
        logger.warn("file.encoding Java property was not set.");
        // a space is needed after the value, otherwise the message reads "UTF-8in XMLConverter..."
        logger.warn("Using XMLConverter.XML_FILE_ENCODING=" + XML_FILE_ENCODING + " in XMLConverter.getDOM() calls.");
    }
    else if (!fileEncodingProperty.equals(XML_FILE_ENCODING)) {
        logger.warn("file.encoding property passed in with JAVA_TOOLS_OPTIONS: "
                    + fileEncodingProperty);
        logger.warn("does not match XMLConverter.XML_FILE_ENCODING: " + XML_FILE_ENCODING);
        logger.warn("Using XMLConverter.XML_FILE_ENCODING in XMLConverter.getDOM() calls.");
    }
}
93
94
95	/** the no-args constructor */
96	public XMLConverter()
97	{
98
99	}
100
101	/** returns a DOM Document
102	* Question: why is this not simply calling the getDOM(String in, String encoding) version?
103	* Answer: Because the string, being a Java String, is already fully formed in unicode,
104	* so it doesn't need to call that other version.
105	*/
106	public static Document getDOM(String in)
107	{
108
109	try
110	{
111	Reader reader = new StringReader(in);
112	InputSource xml_source = new InputSource(reader);
113	Document doc = getDOM(xml_source, null);
114	reader.close();
115	return doc;
116
117	}
118	catch (Exception e)
119	{
120	logger.error(e.getMessage());
121	logger.error("Input string was:\n" + in);
122	e.printStackTrace();
123	}
124	return null;
125	}
126
127	/** returns a DOM Document */
128	public static Document getDOM(String in, String encoding)
129	{
130	try
131	{
132	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
133	InputSource xml_source = new InputSource(reader);
134	Document doc = getDOM(xml_source, null);
135	reader.close();
136	return doc;
137
138	}
139	catch (Exception e)
140	{
141	logger.error(e.getMessage());
142	logger.error("Input string was:\n" + in);
143	e.printStackTrace();
144	}
145	return null;
146	}
147
148	/** returns a DOM Document */
149	public static Document getDOM(File in) {
150
151	// now we're always going to explicitly use XML_FILE_ENCODING (UTF-8)
152	// as the encoding to read in a file unless otherwise specified.
153	return getDOM(in, XML_FILE_ENCODING, null);
154
155	/*
156	try
157	{
158	FileReader reader = new FileReader(in);
159	InputSource xml_source = new InputSource(reader);
160	Document doc = getDOM(xml_source, null);
161	reader.close();
162	return doc;
163
164	}
165	catch (Exception e)
166	{
167	logger.error(e.getMessage(), e);
168	logger.error("File was:\n" + in.getPath());
169	e.printStackTrace();
170
171	}
172	return null;
173	*/
174	}
175
176	public static Document getDOM(File in, String encoding) {
177	return getDOM(in, encoding, null);
178	}
179
180	/** returns a DOM document */
181	public static Document getDOM(File in, String encoding, EntityResolver er) {
182
183	try {
184
185
186	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
187	InputSource xml_source = new InputSource(isr);
188	Document doc = getDOM(xml_source, er);
189	isr.close();
190	return doc;
191
192	}
193	catch (Exception e)
194	{
195	logger.error(e.getMessage());
196	logger.error("File was:\n" + in.getPath());
197	e.printStackTrace();
198	}
199	return null;
200	}
201
202	public static Document getDOM(File in, EntityResolver er) {
203	// now we're always going to explicitly use XML_FILE_ENCODING (UTF-8)
204	// as the encoding to read in a file unless otherwise specified.
205	return getDOM(in, XML_FILE_ENCODING, er);
206	/*
207	try {
208	InputSource xml_source = new InputSource(new FileInputStream(in));
209	Document doc = getDOM(xml_source, er);
210	return doc;
211	}
212	catch (Exception e)
213	{
214	logger.error(e.getMessage());
215	logger.error("File was:\n" + in.getPath());
216	e.printStackTrace();
217	}
218	return null;
219	*/
220	}
221
222	public static Document getDOM(InputSource source, EntityResolver er) {
223
224	try {
225	DOMParser parser = new DOMParser();
226	parser.setFeature("http://xml.org/sax/features/validation", false);
227	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
228	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
229	// a performance test showed that having this on lead to increased
230	// memory use for small-medium docs, and not much gain for large
231	// docs.
232	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
233	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
234	// add an errorhandler to the parser which will output messages on encountering fatal errors, errors and warnings when parsing
235	parser.setErrorHandler(new ParseErrorHandler());
236	if (er != null) {
237	parser.setEntityResolver(er);
238	}
239	parser.parse(source);
240
241	Document doc = parser.getDocument();
242	return doc;
243
244	} catch (Exception e) {
245
246	logger.error(e.getMessage());
247	logger.error("InputSource:\n");
248	e.printStackTrace();
249	}
250	return null;
251
252	}
253
254	/** creates a new empty DOM Document */
255	public static Document newDOM()
256	{
257	Document doc = new DocumentImpl();
258	return doc;
259	}
260
261	/**
262	* This method's parameters represent the parts of the Doctype of this
263	* Document that is to be created. For more info see
264	* http://xerces.apache.org
265	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
266	* .html#DocumentTypeImpl
267	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
268	*
269	* */
270	public static Document newDOM(String qualifiedName, String publicID, String systemID)
271	{
272	// create empty DOM document
273	DocumentImpl docImpl = new DocumentImpl();
274
275	// Need to use the document to create the docType for it
276	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
277
278	// Although we have created the docType using the document, we need to still
279	// put it into the empty document we just created
280	try
281	{
282	docImpl.appendChild(myDocType);
283	}
284	catch (Exception e)
285	{
286	System.out.println("Could not append docType because: " + e);
287	}
288
289	// return the document containing a DocType
290	return docImpl;
291	}
292
293	/** returns the Node as a String */
294	public static String getString(Node xmlNode)
295	{
296	StringBuffer xmlRepresentation = new StringBuffer();
297	getString(xmlNode, xmlRepresentation, 0, false);
298	return xmlRepresentation.toString();
299	}
300
301	/**
302	* returns the node as a nicely formatted String - this introduces extra
303	* text nodes if the String is read back in as a DOM, so should only be used
304	* for printing
305	*/
306	public static String getPrettyString(Node xmlNode)
307	{
308	StringBuffer xmlRepresentation = new StringBuffer();
309	getString(xmlNode, xmlRepresentation, 0, true);
310	return xmlRepresentation.toString();
311	}
312
313	/*
314	* For the purposes of logger.debug statements, where this is called and
315	* hence outputted, returns an empty string if debugging is not enabled
316	*/
317	public static String getPrettyStringLogger(Node xmlNode, Logger log)
318	{
319
320	if (log.isDebugEnabled())
321	return getPrettyString(xmlNode);
322
323	return "";
324
325	}
326
327	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
328	{
329
330	if (xmlNode == null)
331	{
332	xmlRepresentation.append("<null>");
333	return;
334	}
335
336	short nodeType = xmlNode.getNodeType();
337	String nodeName = xmlNode.getNodeName();
338
339	if (nodeType == Node.DOCUMENT_NODE)
340	{
341	Document xmlDocNode = (Document) xmlNode;
342
343	//if (xmlDocNode.getDoctype() == null) {
344	//System.err.println("Doctype is null.");
345	//}
346	//else {
347	if (xmlDocNode.getDoctype() != null)
348	{
349	DocumentType dt = xmlDocNode.getDoctype();
350
351	String name = dt.getName();
352	String pid = dt.getPublicId();
353	String sid = dt.getSystemId();
354
355	// Use previously assigned name, not dt.getName() again
356	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
357
358	xmlRepresentation.append(doctype_str);
359	}
360	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
361	return;
362	}
363	// Handle Element nodes
364	if (nodeType == Node.ELEMENT_NODE)
365	{
366	if (pretty)
367	{
368	xmlRepresentation.append("\n");
369	for (int i = 0; i < depth; i++)
370	{
371	xmlRepresentation.append(" ");
372	}
373	}
374
375	// Write opening tag
376	xmlRepresentation.append("<");
377	xmlRepresentation.append(nodeName);
378
379	// Write the node attributes
380	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
381	for (int i = 0; i < nodeAttributes.getLength(); i++)
382	{
383	Node attribute = nodeAttributes.item(i);
384	xmlRepresentation.append(" ");
385	xmlRepresentation.append(attribute.getNodeName());
386	xmlRepresentation.append("=\"");
387	String attr_val = attribute.getNodeValue();
388
389	attr_val = attr_val.replaceAll("&","&");
390	attr_val = attr_val.replaceAll("<","<");
391	attr_val = attr_val.replaceAll(">",">");
392	attr_val = attr_val.replaceAll("\"",""");
393
394	// assume that any of the above chars that was already entity escaped
395	// was already correct => return back to how they were
396	attr_val = attr_val.replaceAll("&amp;","&");
397	attr_val = attr_val.replaceAll("&lt;","<");
398	attr_val = attr_val.replaceAll("&gt;",">");
399	attr_val = attr_val.replaceAll("&quot;",""");
400	attr_val = attr_val.replaceAll("&apos;","'");
401
402
403	xmlRepresentation.append(attr_val);
404	xmlRepresentation.append("\"");
405	}
406
407	// If the node has no children, close the opening tag and return
408	if (xmlNode.hasChildNodes() == false)
409	{
410	// This produces somewhat ugly output, but it is necessary to compensate
411	// for display bugs in Netscape. Firstly, the space is needed before the
412	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
413	// example). Also, a newline character would be expected after the tag,
414	// but this causes problems with the display of links (the link text
415	// will contain a newline character, which is displayed badly).
416	xmlRepresentation.append(" />");
417	return;
418	}
419
420	// Close the opening tag
421	xmlRepresentation.append(">");
422
423	// Process the children. We process text nodes here, but recursively process other nodes.
424	// hack for nodes next to text nodes - dont make them pretty, ie don'e do any new lines or indenting
425	// Usually text nodes will be inside their own element. Sometimes we have eg span tags next to text nodes - don't want those indented.
426	// also if these are inside a pre tag then the space shows up in the page.
427
428	NodeList children = xmlNode.getChildNodes();
429	boolean do_pretty = pretty;
430	boolean output_escaping = true; // record if we have encountered a disable-output-escaping instruction
431	for (int i = 0; i < children.getLength(); i++)
432	{
433	Node child = children.item(i);
434	short child_type = child.getNodeType();
435	if (child_type == Node.PROCESSING_INSTRUCTION_NODE) {
436	if (child.getNodeName().equals("javax.xml.transform.disable-output-escaping")) {
437	output_escaping = false;
438	}
439	else if (child.getNodeName().equals("javax.xml.transform.enable-output-escaping")) {
440	output_escaping = true;
441	}
442	else {
443	logger.warn("Unhandled processing instruction " + child.getNodeName());
444	}
445	}
446	else if (child_type == Node.TEXT_NODE) {
447	do_pretty = false; // if there is a text node amongst the children, do all the following nodes in non-pretty mode - hope this doesn't stuff up something else
448	// output the text
449	String text = child.getNodeValue();
450
451	// Perform output escaping, if required
452	// Apache Commons replace method is far superior to String.replaceAll - very fast!
453	if (output_escaping) {
454	text = StringUtils.replace(text, "&", "&");
455	text = StringUtils.replace(text, "<", "<");
456	text = StringUtils.replace(text, ">", ">");
457	text = StringUtils.replace(text, "'", "'");
458	text = StringUtils.replace(text, "\"", """);
459	}
460	// Remove any control-C characters
461	text = StringUtils.replace(text, "" + (char) 3, "");
462
463	xmlRepresentation.append(text);
464
465	}
466	else {
467	// recursively call getString
468	getString(child, xmlRepresentation, depth + 1, do_pretty);
469	}
470	} // foreach child of the element
471
472	// Write closing tag
473	if (pretty)
474	{
475	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
476	{
477	for (int i = 0; i < depth; i++)
478	xmlRepresentation.append(" ");
479	}
480	}
481	xmlRepresentation.append("</");
482	xmlRepresentation.append(nodeName);
483	xmlRepresentation.append(">");
484	if (pretty)
485	{
486	xmlRepresentation.append("\n");
487	}
488	} // ELEMENT_NODE
489
490	else if (nodeType == Node.COMMENT_NODE)
491	{
492	String text = xmlNode.getNodeValue();
493	xmlRepresentation.append("<!-- ");
494	xmlRepresentation.append(text);
495	xmlRepresentation.append(" -->");
496	}
497
498	// TEXT and PROCESSING_INSTRUCTION nodes are handled inside their containing element node
499	// A type of node that is not handled yet
500	else
501	{
502	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
503	}
504
505	return;
506	}
507
508	protected static String getNodeTypeString(short node_type)
509	{
510
511	String type = "";
512	switch (node_type)
513	{
514	case Node.ATTRIBUTE_NODE:
515	type = "ATTRIBUTE_NODE";
516	break;
517	case Node.CDATA_SECTION_NODE:
518	type = "CDATA_SECTION_NODE";
519	break;
520	case Node.COMMENT_NODE:
521	type = "COMMENT_NODE";
522	break;
523	case Node.DOCUMENT_FRAGMENT_NODE:
524	type = "DOCUMENT_FRAGMENT_NODE";
525	break;
526	case Node.DOCUMENT_NODE:
527	type = "DOCUMENT_NODE";
528	break;
529	case Node.DOCUMENT_TYPE_NODE:
530	type = "DOCUMENT_TYPE_NODE";
531	break;
532	case Node.ELEMENT_NODE:
533	type = "ELEMENT_NODE";
534	break;
535	case Node.ENTITY_NODE:
536	type = "ENTITY_NODE";
537	break;
538	case Node.ENTITY_REFERENCE_NODE:
539	type = "ENTITY_REFERENCE_NODE";
540	break;
541	case Node.NOTATION_NODE:
542	type = "NOTATION_NODE";
543	break;
544	case Node.PROCESSING_INSTRUCTION_NODE:
545	type = "PROCESSING_INSTRUCTION_NODE";
546	break;
547	case Node.TEXT_NODE:
548	type = "TEXT_NODE";
549	break;
550	default:
551	type = "UNKNOWN";
552	}
553
554	return type;
555	}
556
557	// returns null if there no error occurred during parsing, or else returns the error message
558
559	// public String getParseErrorMessage()
560	// {
561	// ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
562	// return errorHandler.getErrorMessage();
563	// }
564
565	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
566	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
567	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
568	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
569	static public class ParseErrorHandler implements ErrorHandler
570	{
571	protected String errorMessage = null;
572
573	// Receive notification of a recoverable error.
574	public void error(SAXParseException exception)
575	{
576	handleError("Error:\n", exception);
577	}
578
579	// Receive notification of a non-recoverable error.
580	public void fatalError(SAXParseException exception)
581	{
582	handleError("Fatal Error:\n", exception);
583	}
584
585	// Receive notification of a warning.
586	public void warning(SAXParseException exception)
587	{
588	handleError("Warning:\n", exception);
589	}
590
591	public String toString(SAXParseException e)
592	{
593	String msg = e.getMessage();
594	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
595	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
596	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
597
598	return msg;
599	}
600
601	// clears the errorPage variable after first call to this method
602	public String getErrorMessage()
603	{
604	String errMsg = this.errorMessage;
605	if (this.errorMessage != null)
606	{
607	this.errorMessage = null;
608	}
609	return errMsg;
610	}
611
612	// sets the errorMessage member variable to the data stored in the exception
613	// and writes the errorMessage to the logger and tomcat's System.err
614	protected void handleError(String errorType, SAXParseException exception)
615	{
616	this.errorMessage = errorType + toString(exception);
617	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
618	logger.error(this.errorMessage);
619	}
620	}
621
622	public static boolean writeDOM(Element elem, File file) {
623
624	BufferedWriter writer = null;
625	boolean success = false;
626	try {
627	String xml_string = getString(elem);
628	// need createNewFile???
629	writer = new BufferedWriter(new FileWriter(file));
630	writer.write(xml_string);
631	success = true;
632	}
633
634	catch (Exception e) {
635	logger.error(e.getMessage());
636	success = false;
637	}
638	finally {
639	try {
640	if (writer != null) {
641	writer.close();
642	}
643	} catch(Exception e) {
644	logger.error("couldn't close the file"+e.getMessage());
645	}
646	}
647	return success;
648	}
649	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: