Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 28382

Last change on this file since 28382 was 28382, checked in by davidb, 11 years ago
Elimination of the 'this.doc' field from the Action base class and the subclasses that rely on it. For Greenstone3 purposes it is unsafe to create this object in the constructor of the action and then store it for other methods to access. This is because Greenstone 3 (and in particular calls to 'process') operates in a multi-threaded context that is managed by the servlet server (e.g. Tomcat by default). Calls to DOM methods are not guaranteed to be thread safe; this became apparent when we started looking into an exception that was being thrown, which centred around use of the DOM method 'item(i)'. The change this commit makes is to remove 'this.doc' being stored as a field. A document is now created in the top level of a call to 'process()', and when a DOM reference is needed in a subsequent method an Element variable (typically passed in as a parameter to the method) is used (through 'Document doc = element.getOwnerDocument()') to gain access to the DOM.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.4 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.apache.xerces.parsers.DOMParser;
34	import org.apache.xerces.dom.*; // for new Documents
35
36	// other java classes
37	import java.io.ByteArrayInputStream;
38	import java.io.InputStream;
39	import java.io.Reader;
40	import java.io.InputStreamReader;
41	import java.io.StringReader;
42	import java.io.File;
43	import java.io.FileInputStream;
44	import java.io.FileReader;
45	import java.util.regex.*;
46
47	import org.apache.log4j.*;
48
49	// Apache Commons
50	import org.apache.commons.lang3.*;
51
52	import java.util.*;
53	import java.lang.reflect.*;
54
55	/**
56	* XMLConverter - utility class for greenstone
57	*
58	* parses XML Strings into Documents, converts Nodes to Strings different
59	* parsers have different behaviour - can experiment in here now we only use
60	* xerces
61	*
62	*/
63	public class XMLConverter
64	{
65
66	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
67
68	protected EntityResolver resolver = null;
69
70	/** xerces parser */
71	protected DOMParser parser = null;
72
73	private static boolean outputEscaping = true;
74
75	/** the no-args constructor */
76	public XMLConverter()
77	{
78	try
79	{
80	this.parser = new DOMParser();
81	this.parser.setFeature("http://xml.org/sax/features/validation", false);
82	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
83	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
84	// a performance test showed that having this on lead to increased
85	// memory use for small-medium docs, and not much gain for large
86	// docs.
87	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
88	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
89	// add an errorhandler to the parser which will store useful a error message on encountering fatal errors, errors and warnings when parsing
90	// this errormessage can then be converted to xhtml and displayed in a browser.
91	this.parser.setErrorHandler(new ParseErrorHandler());
92	}
93	catch (Exception e)
94	{
95	logger.error(e.getMessage());
96	}
97	}
98
99	/** sets the entity resolver. pass in null to unset it */
100	public void setEntityResolver(EntityResolver er)
101	{
102	this.resolver = er;
103	this.parser.setEntityResolver(er);
104	}
105
106	/**
107	* Given a Node representing an Element or Document, will return the
108	* Element/docroot Element. Returns null if the Node was not an element.
109	*/
110	public static Element nodeToElement(Node node)
111	{
112	if (node == null)
113	{
114	return null;
115	}
116	short nodeType = node.getNodeType();
117
118	if (nodeType == Node.DOCUMENT_NODE)
119	{
120	Document docNode = (Document) node;
121	return docNode.getDocumentElement();
122	}
123	else if (nodeType == Node.ELEMENT_NODE)
124	{
125	return (Element) node;
126	}
127	else
128	{
129	String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
130	System.err.println(message);
131	logger.warn(message);
132	return null;
133	}
134	}
135
136	/** returns a DOM Document */
137	public Document getDOM(String in)
138	{
139
140	try
141	{
142	Reader reader = new StringReader(in);
143	InputSource xml_source = new InputSource(reader);
144
145	DOMParser parser = new DOMParser();
146	parser.setFeature("http://xml.org/sax/features/validation", false);
147	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
148	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
149	if (resolver != null)
150	{
151	parser.setEntityResolver(this.resolver);
152	}
153	parser.setErrorHandler(new ParseErrorHandler());
154	parser.parse(xml_source);
155
156	Document doc = parser.getDocument();
157
158	reader.close();
159	return doc;
160
161	}
162	catch (Exception e)
163	{
164	logger.error(e.getMessage());
165	}
166	return null;
167	}
168
169	/** returns a DOM Document */
170	public Document getDOM(String in, String encoding)
171	{
172	try
173	{
174	InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(in.getBytes(encoding)), encoding);
175	InputSource xml_source = new InputSource(reader);
176
177	DOMParser parser = new DOMParser();
178	parser.setFeature("http://xml.org/sax/features/validation", false);
179	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
180	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
181	if (resolver != null)
182	{
183	parser.setEntityResolver(this.resolver);
184	}
185	parser.setErrorHandler(new ParseErrorHandler());
186	parser.parse(xml_source);
187
188	Document doc = parser.getDocument();
189
190	reader.close();
191	return doc;
192
193	}
194	catch (Exception e)
195	{
196	logger.error(e.getMessage());
197	}
198	return null;
199	}
200
201	/** returns a DOM Document */
202	public Document getDOM(File in)
203	{
204	try
205	{
206	FileReader reader = new FileReader(in);
207	InputSource xml_source = new InputSource(reader);
208
209	DOMParser parser = new DOMParser();
210	parser.setFeature("http://xml.org/sax/features/validation", false);
211	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
212	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
213	if (resolver != null)
214	{
215	parser.setEntityResolver(this.resolver);
216	}
217	parser.setErrorHandler(new ParseErrorHandler());
218	parser.parse(xml_source);
219
220	Document doc = parser.getDocument();
221
222	reader.close();
223	return doc;
224
225	}
226	catch (Exception e)
227	{
228	logger.error(e.getMessage(), e);
229
230	}
231	return null;
232	}
233
234	/** returns a DOM document */
235	public Document getDOM(File in, String encoding)
236	{
237	try
238	{
239
240	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
241	InputSource xml_source = new InputSource(isr);
242
243	DOMParser parser = new DOMParser();
244	parser.setFeature("http://xml.org/sax/features/validation", false);
245	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
246	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
247	if (resolver != null)
248	{
249	parser.setEntityResolver(this.resolver);
250	}
251	parser.setErrorHandler(new ParseErrorHandler());
252	parser.parse(xml_source);
253
254	Document doc = parser.getDocument();
255
256	isr.close();
257	return doc;
258
259	}
260	catch (Exception e)
261	{
262	logger.error(e.getMessage());
263	}
264	return null;
265	}
266
267	/** creates a new empty DOM Document */
268	public Document newDOM()
269	{
270	Document doc = new DocumentImpl();
271	return doc;
272	}
273
274	/**
275	* This method's parameters represent the parts of the Doctype of this
276	* Document that is to be created. For more info see
277	* http://xerces.apache.org
278	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
279	* .html#DocumentTypeImpl
280	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
281	*
282	* */
283	public static Document newDOM(String qualifiedName, String publicID, String systemID)
284	{
285	// create empty DOM document
286	DocumentImpl docImpl = new DocumentImpl();
287
288	// Need to use the document to create the docType for it
289	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
290
291	// Although we have created the docType using the document, we need to still
292	// put it into the empty document we just created
293	try
294	{
295	docImpl.appendChild(myDocType);
296	}
297	catch (Exception e)
298	{
299	System.out.println("Could not append docType because: " + e);
300	}
301
302	// return the document containing a DocType
303	return docImpl;
304	}
305
306	/** returns the Node as a String */
307	public static String getString(Node xmlNode)
308	{
309	outputEscaping = true;
310	StringBuffer xmlRepresentation = new StringBuffer();
311	getString(xmlNode, xmlRepresentation, 0, false);
312	return xmlRepresentation.toString();
313	}
314
315	/**
316	* returns the node as a nicely formatted String - this introduces extra
317	* text nodes if the String is read back in as a DOM, so should only be used
318	* for printing
319	*/
320	public static String getPrettyString(Node xmlNode)
321	{
322
323	outputEscaping = true;
324	StringBuffer xmlRepresentation = new StringBuffer();
325	getString(xmlNode, xmlRepresentation, 0, true);
326	return xmlRepresentation.toString();
327	}
328
329	/*
330	* For the purposes of logger.debug statements, where this is called and
331	* hence outputted, returns an empty string if debugging is not enabled
332	*/
333	public static String getPrettyStringLogger(Node xmlNode, Logger log)
334	{
335
336	if (log.isDebugEnabled())
337	return getPrettyString(xmlNode);
338
339	return "";
340
341	}
342
343	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
344	{
345
346	if (xmlNode == null)
347	{
348	xmlRepresentation.append("<null>");
349	return;
350	}
351
352	short nodeType = xmlNode.getNodeType();
353	String nodeName = xmlNode.getNodeName();
354
355	if (nodeType == Node.DOCUMENT_NODE)
356	{
357	Document xmlDocNode = (Document) xmlNode;
358
359	//if (xmlDocNode.getDoctype() == null) {
360	//System.err.println("Doctype is null.");
361	//}
362	//else {
363	if (xmlDocNode.getDoctype() != null)
364	{
365	DocumentType dt = xmlDocNode.getDoctype();
366
367	String name = dt.getName();
368	String pid = dt.getPublicId();
369	String sid = dt.getSystemId();
370
371	// Use previously assigned name, not dt.getName() again
372	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
373
374	xmlRepresentation.append(doctype_str);
375	}
376	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
377	return;
378	}
379	// Handle Element nodes
380	if (nodeType == Node.ELEMENT_NODE)
381	{
382	if (pretty)
383	{
384	xmlRepresentation.append("\n");
385	for (int i = 0; i < depth; i++)
386	{
387	xmlRepresentation.append(" ");
388	}
389	}
390
391	// Write opening tag
392	xmlRepresentation.append("<");
393	xmlRepresentation.append(nodeName);
394
395	// Write the node attributes
396	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
397	for (int i = 0; i < nodeAttributes.getLength(); i++)
398	{
399	Node attribute = nodeAttributes.item(i);
400	xmlRepresentation.append(" ");
401	xmlRepresentation.append(attribute.getNodeName());
402	xmlRepresentation.append("=\"");
403	xmlRepresentation.append(attribute.getNodeValue());
404	xmlRepresentation.append("\"");
405	}
406
407	// If the node has no children, close the opening tag and return
408	if (xmlNode.hasChildNodes() == false)
409	{
410	// This produces somewhat ugly output, but it is necessary to compensate
411	// for display bugs in Netscape. Firstly, the space is needed before the
412	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
413	// example). Also, a newline character would be expected after the tag,
414	// but this causes problems with the display of links (the link text
415	// will contain a newline character, which is displayed badly).
416	xmlRepresentation.append(" />");
417	return;
418	}
419
420	// Close the opening tag
421	xmlRepresentation.append(">");
422
423	// Apply recursively to the children of this node
424	// hack for nodes next to text nodes - dont make them pretty
425	// this is needed for text inside a <pre> element - any new lines
426	// or spaces around the span elements show up in the text
427	NodeList children = xmlNode.getChildNodes();
428	boolean do_pretty = pretty;
429	for (int i = 0; i < children.getLength(); i++)
430	{
431	if (children.item(i).getNodeType() == Node.TEXT_NODE)
432	{
433	do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
434	}
435	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
436	}
437
438	// Write closing tag
439	if (pretty)
440	{
441	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
442	{
443	for (int i = 0; i < depth; i++)
444	xmlRepresentation.append(" ");
445	}
446	}
447	xmlRepresentation.append("</");
448	xmlRepresentation.append(nodeName);
449	xmlRepresentation.append(">");
450	if (pretty)
451	{
452	xmlRepresentation.append("\n");
453	}
454	}
455
456	// Handle Text nodes
457	else if (nodeType == Node.TEXT_NODE)
458	{
459	String text = xmlNode.getNodeValue();
460
461	// Perform output escaping, if required
462	// Apache Commons replace method is far superior to String.replaceAll - very fast!
463	if (outputEscaping)
464	{
465
466	text = StringUtils.replace(text, "&", "&");
467	text = StringUtils.replace(text, "<", "<");
468	text = StringUtils.replace(text, ">", ">");
469	text = StringUtils.replace(text, "'", "'");
470	text = StringUtils.replace(text, "\"", """);
471	}
472
473	// Remove any control-C characters
474	text = StringUtils.replace(text, "" + (char) 3, "");
475
476	xmlRepresentation.append(text);
477	}
478
479	// Handle Processing Instruction nodes
480	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
481	{
482	if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
483	{
484	outputEscaping = false;
485	}
486	else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
487	{
488	outputEscaping = true;
489	}
490	else
491	{
492	logger.warn("Unhandled processing instruction " + nodeName);
493	}
494	}
495
496	else if (nodeType == Node.COMMENT_NODE)
497	{
498	String text = xmlNode.getNodeValue();
499	xmlRepresentation.append("<!-- ");
500	xmlRepresentation.append(text);
501	xmlRepresentation.append(" -->");
502	}
503
504	// A type of node that is not handled yet
505	else
506	{
507	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
508	}
509
510	return;
511	}
512
513	protected static String getNodeTypeString(short node_type)
514	{
515
516	String type = "";
517	switch (node_type)
518	{
519	case Node.ATTRIBUTE_NODE:
520	type = "ATTRIBUTE_NODE";
521	break;
522	case Node.CDATA_SECTION_NODE:
523	type = "CDATA_SECTION_NODE";
524	break;
525	case Node.COMMENT_NODE:
526	type = "COMMENT_NODE";
527	break;
528	case Node.DOCUMENT_FRAGMENT_NODE:
529	type = "DOCUMENT_FRAGMENT_NODE";
530	break;
531	case Node.DOCUMENT_NODE:
532	type = "DOCUMENT_NODE";
533	break;
534	case Node.DOCUMENT_TYPE_NODE:
535	type = "DOCUMENT_TYPE_NODE";
536	break;
537	case Node.ELEMENT_NODE:
538	type = "ELEMENT_NODE";
539	break;
540	case Node.ENTITY_NODE:
541	type = "ENTITY_NODE";
542	break;
543	case Node.ENTITY_REFERENCE_NODE:
544	type = "ENTITY_REFERENCE_NODE";
545	break;
546	case Node.NOTATION_NODE:
547	type = "NOTATION_NODE";
548	break;
549	case Node.PROCESSING_INSTRUCTION_NODE:
550	type = "PROCESSING_INSTRUCTION_NODE";
551	break;
552	case Node.TEXT_NODE:
553	type = "TEXT_NODE";
554	break;
555	default:
556	type = "UNKNOWN";
557	}
558
559	return type;
560	}
561
562	// returns null if there no error occurred during parsing, or else returns the error message
563	public String getParseErrorMessage()
564	{
565	ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
566	return errorHandler.getErrorMessage();
567	}
568
569	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
570	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
571	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
572	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
573	static public class ParseErrorHandler implements ErrorHandler
574	{
575	protected String errorMessage = null;
576
577	// Receive notification of a recoverable error.
578	public void error(SAXParseException exception)
579	{
580	handleError("Error:\n", exception);
581	}
582
583	// Receive notification of a non-recoverable error.
584	public void fatalError(SAXParseException exception)
585	{
586	handleError("Fatal Error:\n", exception);
587	}
588
589	// Receive notification of a warning.
590	public void warning(SAXParseException exception)
591	{
592	handleError("Warning:\n", exception);
593	}
594
595	public String toString(SAXParseException e)
596	{
597	String msg = e.getMessage();
598	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
599	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
600	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
601
602	return msg;
603	}
604
605	// clears the errorPage variable after first call to this method
606	public String getErrorMessage()
607	{
608	String errMsg = this.errorMessage;
609	if (this.errorMessage != null)
610	{
611	this.errorMessage = null;
612	}
613	return errMsg;
614	}
615
616	// sets the errorMessage member variable to the data stored in the exception
617	// and writes the errorMessage to the logger and tomcat's System.err
618	protected void handleError(String errorType, SAXParseException exception)
619	{
620	this.errorMessage = errorType + toString(exception);
621	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
622	logger.error(this.errorMessage);
623	}
624	}
625	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: