Context Navigation

source: main/trunk/gli/src/org/greenstone/gatherer/util/XMLTools.java@ 35667

Last change on this file since 35667 was 35667, checked in by davidb, 3 years ago
Some notes on how to address deprecated warnings added
Property svn:keywords set to `Author Date Id Revision`
File size: 34.6 KB

Line
1	package org.greenstone.gatherer.util;
2
3	import java.io.*;
4	import java.net.*;
5	import java.util.*;
6	import org.apache.xerces.parsers.*;
7	import org.apache.xml.serialize.*;
8	import org.greenstone.gatherer.DebugStream;
9	import org.w3c.dom.*;
10	import org.xml.sax.*;
11
12	import java.io.FileReader;
13	import java.io.IOException;
14	import java.io.StringReader;
15	import java.io.StringWriter; // for elementToString()
16
17	// SAX
18	import org.xml.sax.XMLReader;
19	import org.xml.sax.SAXException;
20	import org.xml.sax.SAXParseException;
21	import org.xml.sax.helpers.DefaultHandler;
22	import org.xml.sax.InputSource;
23
24	// JAXP
25	import javax.xml.parsers.DocumentBuilder;
26	import javax.xml.parsers.DocumentBuilderFactory;
27	import javax.xml.parsers.FactoryConfigurationError;
28	import javax.xml.parsers.ParserConfigurationException;
29	import javax.xml.parsers.SAXParser;
30	import javax.xml.parsers.SAXParserFactory;
31	// for elementToString():
32	import javax.xml.transform.OutputKeys;
33	import javax.xml.transform.Transformer;
34	import javax.xml.transform.TransformerFactory;
35	import javax.xml.transform.dom.DOMSource;
36	import javax.xml.transform.stream.StreamResult;
37
38
39	/** This class is a static class containing useful XML functions */
40	public class XMLTools
41	{
42	/** extracts the text out of a node */
43	public static Node getNodeTextNode(Element param)
44	{
45	param.normalize();
46	Node n = param.getFirstChild();
47	while (n != null && n.getNodeType() != Node.TEXT_NODE)
48	{
49	n = n.getNextSibling();
50	}
51	return n;
52	}
53
54	/** extracts the text out of a node */
55	public static String getNodeText(Element param)
56	{
57	Node text_node = getNodeTextNode(param);
58	if (text_node == null)
59	{
60	return "";
61	}
62	return text_node.getNodeValue();
63	}
64
65	public static void setNodeText(Element elem, String text)
66	{
67	Node old_text_node = getNodeTextNode(elem);
68	if (old_text_node != null)
69	{
70	elem.removeChild(old_text_node);
71	}
72	Text t = elem.getOwnerDocument().createTextNode(text);
73	elem.appendChild(t);
74	}
75
76	/** returns the (first) child element with the given name */
77	public static Node getChildByTagName(Node n, String name)
78	{
79
80	Node child = n.getFirstChild();
81	while (child != null)
82	{
83	if (child.getNodeName().equals(name))
84	{
85	return child;
86	}
87	child = child.getNextSibling();
88	}
89	return null; //not found
90	}
91
92	/**
93	* returns the (nth) child element with the given name index numbers start
94	* at 0
95	*/
96	public static Node getChildByTagNameIndexed(Node n, String name, int index)
97	{
98	if (index == -1)
99	{
100	return getChildByTagName(n, name);
101	}
102	int count = 0;
103	Node child = n.getFirstChild();
104	while (child != null)
105	{
106	if (child.getNodeName().equals(name))
107	{
108	if (count == index)
109	{
110	return child;
111	}
112	else
113	{
114	count++;
115	}
116	}
117	child = child.getNextSibling();
118	}
119	return null; //not found
120	}
121
122	/**
123	* returns the element parent/node_name[@attribute_name='attribute_value']
124	*/
125	public static Element getNamedElement(Element parent, String node_name, String attribute_name, String attribute_value)
126	{
127
128	NodeList children = parent.getChildNodes();
129	for (int i = 0; i < children.getLength(); i++)
130	{
131	Node child = children.item(i);
132	//logger.debug("getnamed elem, node nmae="+child.getNodeName());
133	if (child.getNodeName().equals(node_name))
134	{
135	if (((Element) child).getAttribute(attribute_name).equals(attribute_value))
136	return (Element) child;
137	}
138	}
139	// not found
140	return null;
141	}
142
143	/**
144	* returns a list of elements
145	* parent/node_name[@attribute_name='attribute_value']
146	*/
147	public static ArrayList getNamedElementList(Element parent, String node_name, String attribute_name, String attribute_value)
148	{
149	ArrayList elements = new ArrayList();
150	NodeList children = parent.getChildNodes();
151	for (int i = 0; i < children.getLength(); i++)
152	{
153	//System.out.println("getNamedElementList");
154	Node child = children.item(i);
155	//logger.debug("getnamed elem, node nmae="+child.getNodeName());
156	if (child.getNodeName().equals(node_name))
157	{
158	if (((Element) child).getAttribute(attribute_name).equals(attribute_value))
159	elements.add((Element) child);
160	}
161	}
162	// not found
163	if (elements.size() == 0)
164	{
165	elements = null;
166	}
167	return elements;
168	}
169
170	public static void copyAllChildren(Element to, Element from)
171	{
172
173	Document to_doc = to.getOwnerDocument();
174	Node child = from.getFirstChild();
175	while (child != null)
176	{
177	to.appendChild(to_doc.importNode(child, true));
178	child = child.getNextSibling();
179	}
180	}
181
182	/** duplicates all elements in list elements and appends to toElement */
183	public static void duplicateElementList(Document owner, Element toElement, NodeList elements, boolean with_attributes) {
184	int num_elems = elements.getLength();
185	if (num_elems < 1)
186	{
187	return;
188	}
189	for (int i = 0; i < num_elems; i++)
190	{
191	Element to_element = XMLTools.duplicateElement(owner, (Element) elements.item(i), with_attributes);
192	toElement.appendChild(to_element);
193	}
194
195	}
196	/** Duplicates an element */
197	public static Element duplicateElement(Document owner, Element element, boolean with_attributes)
198	{
199	return duplicateElementNS(owner, element, null, with_attributes);
200	}
201
202	/** Duplicates an element */
203	public static Element duplicateElementNS(Document owner, Element element, String namespace_uri, boolean with_attributes)
204	{
205	Element duplicate;
206	if (namespace_uri == null)
207	{
208	duplicate = owner.createElement(element.getTagName());
209	}
210	else
211	{
212	duplicate = owner.createElementNS(namespace_uri, element.getTagName());
213	}
214	// Copy element attributes
215	if (with_attributes)
216	{
217	NamedNodeMap attributes = element.getAttributes();
218	for (int i = 0; i < attributes.getLength(); i++)
219	{
220	Node attribute = attributes.item(i);
221	duplicate.setAttribute(attribute.getNodeName(), attribute.getNodeValue());
222	}
223	}
224
225	// Copy element children
226	NodeList children = element.getChildNodes();
227	for (int i = 0; i < children.getLength(); i++)
228	{
229	Node child = children.item(i);
230	duplicate.appendChild(owner.importNode(child, true));
231	}
232
233	return duplicate;
234	}
235
236	/** Remove all of the child nodes from a certain node. */
237	static final public void clear(Node node)
238	{
239	while (node.hasChildNodes())
240	{
241	node.removeChild(node.getFirstChild());
242	}
243	}
244
245	static public ArrayList getChildElementsByTagName(Element parent_element, String element_name)
246	{
247	ArrayList child_elements = new ArrayList();
248
249	NodeList children_nodelist = parent_element.getChildNodes();
250	for (int i = 0; i < children_nodelist.getLength(); i++)
251	{
252	Node child_node = children_nodelist.item(i);
253	if (child_node.getNodeType() == Node.ELEMENT_NODE && child_node.getNodeName().equals(element_name))
254	{
255	child_elements.add(child_node);
256	}
257	}
258
259	return child_elements;
260	}
261
262	static public String getElementTextValue(Element element)
263	{
264	// Find the first text node child
265	NodeList children_nodelist = element.getChildNodes();
266	for (int i = 0; i < children_nodelist.getLength(); i++)
267	{
268	Node child_node = children_nodelist.item(i);
269	if (child_node.getNodeType() == Node.TEXT_NODE)
270	{
271	return child_node.getNodeValue();
272	}
273	}
274
275	// None found
276	return "";
277	}
278
279	/**
280	* Method to retrieve the value of a given node.
281	*
282	* @param element
283	* The <strong>Element</strong> whose value we wish to find. Soon
284	* to be deprecated!
285	*/
286	static final public String getValue(Node element)
287	{
288	if (element == null)
289	{
290	return "";
291	}
292	// If we've been given a subject node first retrieve its value node.
293	if (element.getNodeName().equals("Subject"))
294	{
295	element = getNodeFromNamed(element, "Value");
296	}
297	// If we've got a value node, then reconstruct the text. Remember that DOM will split text over 256 characters into several text nodes
298	if (element != null && element.hasChildNodes())
299	{
300	StringBuffer text_buffer = new StringBuffer();
301	NodeList text_nodes = element.getChildNodes();
302	for (int i = 0; i < text_nodes.getLength(); i++)
303	{
304	Node possible_text = text_nodes.item(i);
305	if (possible_text.getNodeName().equals(StaticStrings.TEXT_NODE))
306	{
307	text_buffer.append(possible_text.getNodeValue());
308	}
309	}
310	return text_buffer.toString();
311	}
312	return "";
313	}
314
315	/**
316	* Method to retrieve from the node given, a certain child node with the
317	* specified name.
318	*
319	* @param parent
320	* The <strong>Node</strong> whose children should be searched.
321	* @param name
322	* The required nodes name as a <strong>String</strong>.
323	* @return The requested <strong>Node</strong> if it is found, <i>null</i>
324	* otherwise. Soon to be deprecated!
325	*/
326	static final public Node getNodeFromNamed(Node parent, String name)
327	{
328	Node child = null;
329	for (Node i = parent.getFirstChild(); i != null && child == null; i = i.getNextSibling())
330	{
331	if (i.getNodeName().equals(name))
332	{
333	child = i;
334	}
335	}
336	return child;
337	}
338
/** Result returned by the parse methods of this class when parsing raised no exception. */
339	static final public String WELLFORMED = "well-formed !";
/** Prefix of the result returned by the parse methods when a SAXParseException is caught; the error location and message are appended to it. */
340	static final public String NOTWELLFORMED = "not well-formed";
/** XML declaration plus opening collectionConfig tag (declaring the gsf, gslib and xsl namespace prefixes) that parse(String) puts in front of its input before parsing. */
341	static final private String HEADER = "<?xml version='1.0' encoding='UTF-8'?><collectionConfig xmlns:gsf='http://www.greenstone.org/greenstone3/schema/ConfigFormat' xmlns:gslib='http://www.greenstone.org/skinning' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>";
/** Closing collectionConfig tag that parse(String) puts after its input before parsing. */
342	static final private String FOOTER = "</collectionConfig>";
343
344
345	public static Document getDOM(String xml_str)
346	{
347	Document doc = null;
348	try {
349
350	DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
351	InputSource is = new InputSource();
352	is.setCharacterStream(new StringReader(xml_str));
353	doc = db.parse(is);
354
355	} catch (Exception e) {
356	e.printStackTrace();
357	}
358	return doc;
359	}
360
361	public static String parse(String xml_str)
362	{
363	String validation_msg = WELLFORMED;
364	xml_str = HEADER + xml_str + FOOTER;
365	try
366	{
367	SAXParserFactory factory = SAXParserFactory.newInstance();
368	factory.setNamespaceAware(true);
369	//factory.setValidating (true);
370	SAXParser parser = factory.newSAXParser();
371	InputSource iSource = new InputSource(new StringReader(xml_str));
372	// parser.parse (iSource, new DefaultHandler ());
373
374	org.xml.sax.XMLReader reader = parser.getXMLReader();
375	reader.setContentHandler(new DefaultHandler());
376	reader.setErrorHandler(new DefaultHandler());
377	reader.parse(iSource);
378	}
379	catch (FactoryConfigurationError e)
380	{
381	validation_msg = "unable to get a document builder factory";
382	}
383	catch (ParserConfigurationException e)
384	{
385	validation_msg = "unable to configure parser";
386	}
387	catch (SAXParseException e)
388	{
389	validation_msg = NOTWELLFORMED + getLocationString(e) + e.getMessage();
390	}
391	catch (SAXException e)
392	{
393	validation_msg += " Fatal error: " + e.toString();
394	}
395	catch (IOException e)
396	{
397	validation_msg = "Unable to read the input, i/o error";
398	}
399
400	return validation_msg;
401	}
402
403	//In this method, the parsed string xml_str is not wrapped by the header and footer strings.
404	public static String parseDOM(String xml_str)
405	{
406	String validation_msg = WELLFORMED;
407
408	try
409	{
410	SAXParserFactory factory = SAXParserFactory.newInstance();
411	factory.setNamespaceAware(true);
412	//factory.setValidating (true);
413	SAXParser parser = factory.newSAXParser();
414	InputSource iSource = new InputSource(new StringReader(xml_str));
415	// parser.parse (iSource, new DefaultHandler ());
416
417	org.xml.sax.XMLReader reader = parser.getXMLReader();
418	reader.setContentHandler(new DefaultHandler());
419	reader.setErrorHandler(new DefaultHandler());
420	reader.parse(iSource);
421	}
422	catch (FactoryConfigurationError e)
423	{
424	validation_msg = "unable to get a document builder factory";
425	}
426	catch (ParserConfigurationException e)
427	{
428	validation_msg = "unable to configure parser";
429	}
430	catch (SAXParseException e)
431	{
432	validation_msg = NOTWELLFORMED + getLocationString(e) + e.getMessage();
433	}
434	catch (SAXException e)
435	{
436	validation_msg += " " + e.toString();
437	}
438	catch (IOException e)
439	{
440	validation_msg = "Unable to read the input, i/o error";
441	}
442
443	return validation_msg;
444	}
445
446	public static String parse(File xml_file)
447	{
448	String validation_msg = WELLFORMED;
449
450	try
451	{
452	SAXParserFactory factory = SAXParserFactory.newInstance();
453	factory.setNamespaceAware(true);
454	//factory.setValidating (true);
455	SAXParser parser = factory.newSAXParser();
456	FileReader r = new FileReader(xml_file);
457	InputSource iSource = new InputSource(r);
458	XMLReader reader = parser.getXMLReader();
459	reader.setContentHandler(new DefaultHandler());
460	reader.setErrorHandler(new DefaultHandler());
461	reader.parse(iSource);
462	}
463	catch (FactoryConfigurationError e)
464	{
465	validation_msg = "unable to get a document builder factory";
466	}
467	catch (ParserConfigurationException e)
468	{
469	validation_msg = "unable to configure parser";
470	}
471	catch (SAXParseException e)
472	{
473	validation_msg = NOTWELLFORMED + getLocationString(e) + e.getMessage();
474	}
475	catch (SAXException e)
476	{
477	validation_msg += " Fatal error: " + e.toString();
478	}
479	catch (IOException e)
480	{
481	validation_msg = "Unable to read the input, i/o error";
482	}
483
484	return validation_msg;
485	}
486
487	/** Returns a string of the location. */
488	private static String getLocationString(SAXParseException ex)
489	{
490	StringBuffer str = new StringBuffer();
491
492	String systemId = ex.getSystemId();
493	if (systemId != null)
494	{
495	int index = systemId.lastIndexOf('/');
496	if (index != -1)
497	systemId = systemId.substring(index + 1);
498	str.append(systemId);
499	}
500	str.append("(line ");
501	str.append(ex.getLineNumber() - 1);
502	str.append(", column ");
503	str.append(ex.getColumnNumber());
504	str.append("): ");
505
506	return str.toString();
507
508	} // getLocationString(SAXParseException):String
509
510	/** Parse an XML document from a given file path */
511	static public Document parseXMLFile(String xml_file_path, boolean use_class_loader)
512	{
513	if (use_class_loader == true)
514	{
515	InputStream is = JarTools.getResourceAsStream("/" + xml_file_path);
516	if (is != null)
517	{
518	return parseXML(is);
519	}
520	}
521
522	// Try the file outside the classes directory
523	return parseXMLFile(new File(xml_file_path));
524	}
525
526	/** Parse an XML document from a given file */
527	static public Document parseXMLFile(File xml_file)
528	{
529	// No file? No point trying!
530	if (xml_file.exists() == false)
531	{
532	// System.err.println("@@@ file " + xml_file + " does not exist.");
533	return null;
534	}
535
536	try
537	{
538	return parseXML(new FileInputStream(xml_file));
539	}
540	catch (Exception exception)
541	{
542	DebugStream.printStackTrace(exception);
543	return null;
544	}
545	}
546
547	/** Parse an XML document from a given input stream */
548	static public Document parseXML(InputStream xml_input_stream)
549	{
550	Document document = null;
551
552	try
553	{
554	InputStreamReader isr = new InputStreamReader(xml_input_stream, "UTF-8");
555	document = parseXML(isr);
556	isr.close();
557	xml_input_stream.close();
558	}
559	catch (Exception exception)
560	{
561	DebugStream.printStackTrace(exception);
562	}
563
564	return document;
565	}
566
567	/** Parse an XML document from a given reader */
568	static public Document parseXML(Reader xml_reader)
569	{
570	Document document = null;
571
572	// If debugging, the following will store the XML contents to be parsed,
573	// which can then be inspected upon encountering a SAXException (need to run GLI with -debug on)
574	String xmlContents = "";
575
576	try
577	{
578	Reader reader = null;
579
580	// (1) By default, GLI will remove any contents preceeding (and invalidating)
581	// the XML and present these lines separately to the user
582	if (!DebugStream.isDebuggingEnabled())
583	{
584	try
585	{
586	reader = new BufferedReader(new RemoveContentBeforeRootElementXMLReader(xml_reader));
587	}
588	catch (Exception e)
589	{
590	System.err.println("Exception while wrapping the reader in parseXML(Reader)");
591	e.printStackTrace();
592	}
593	}
594
595	// (2) If we are running GLI in debug mode:
596	// In case parsing exceptions are thrown (SAX Exceptions), we want to get some
597	// idea of where things went wrong. This will print the "XML" contents to either
598	// system.out (if debugging is off) or to the DebugStream otherwise.
599	// We need to read the XML twice to know the line where things went wrong, so
600	// do the additional reading only if we're debugging
601	else
602	{
603	StringBuffer buf = new StringBuffer();
604	char[] buffer = new char[500];
605	int numCharsRead = xml_reader.read(buffer, 0, buffer.length);
606	while (numCharsRead != -1)
607	{
608	buf.append(buffer, 0, numCharsRead);
609	numCharsRead = xml_reader.read(buffer, 0, buffer.length);
610	}
611	xmlContents = buf.toString();
612	xml_reader.close(); // closing the old Reader
613	xml_reader = null;
614	buffer = null;
615	buf = null;
616	// we need a Reader to parse the same contents as the Reader that was just closed
617	reader = new BufferedReader(new StringReader(xmlContents));
618	//System.err.println("xmlContents:\n" + xmlContents);
619	}
620
621	// (2) The actual XML parsing
622	InputSource isc = new InputSource(reader);
623	DOMParser parser = new DOMParser();
624	parser.setFeature("http://xml.org/sax/features/validation", false);
625	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
626	// May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
627	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
628	parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
629	parser.setEntityResolver(new GLIEntityResolver());
630	parser.parse(isc);
631	document = parser.getDocument();
632
633	}
634	catch (SAXParseException e)
635	{
636	showXMLParseFailureLine(e, xmlContents);
637	}
638	catch (SAXException exception)
639	{
640	System.err.println("SAX exception: " + exception.getMessage());
641	if (DebugStream.isDebuggingEnabled())
642	{
643	DebugStream.println("Encountered a SAX exception when parsing the following:\n*******START\n" + xmlContents + "\n**********END\n");
644	// Exit to let the user view the erroneous line/xml before it goes past the screen buffer?
645	DebugStream.println("Debug mode: Exiting the program as there was trouble parsing the XML...");
646	System.exit(-1);
647	}
648	// else, not running in debug mode, so don't exit after exception
649	System.out.println("***Turn debugging on (run GLI with -debug) to view the XML contents that could not be parsed.");
650	DebugStream.printStackTrace(exception);
651	}
652	catch (Exception exception)
653	{
654	DebugStream.printStackTrace(exception);
655	}
656
657	return document;
658	}
659
660	/**
661	* Displays the line (string) where the SAXParseException occurred, given a
662	* String of the entire xml that was being parsed and the SAXParseException
663	* object that was caught. The messages are printed to DebugStream, so run
664	* GLI/FLI with -debug to view this output.
665	*
666	* @param xmlContents
667	* is the entire xml that was being parsed when the exception
668	* occurred
669	* @param e
670	* is the SAXParseException object that was thrown upon parsing
671	* the xmlContents.
672	*/
673	public static void showXMLParseFailureLine(SAXParseException e, String xmlContents)
674	{
675
676	// There should be no characters at all that preceed the <?xml>... bit.
677	// The first check is for starting spaces:
678	if (xmlContents.startsWith("\n") \|\| xmlContents.startsWith(" ") \|\| xmlContents.startsWith("\t"))
679	{
680	DebugStream.println("ERROR: illegal start of XML. Space/tab/newline should not preceed xml declaration.\n");
681	DebugStream.println("xmlContents (length is " + xmlContents.length() + "):\n" + xmlContents);
682	return; // nothing more to do, first error identified
683	}
684
685	// the actual line (String literal) where parsing failed and the SAXParseException occurred.
686	String line = "";
687	int linenumber = e.getLineNumber();
688	DebugStream.print("\n****SAXParseException on LINE NUMBER: " + linenumber);
689	if (DebugStream.isDebuggingEnabled())
690	{
691	if (linenumber != -1)
692	{
693	String[] lines = xmlContents.split("\n");
694	if (lines.length > 0)
695	{
696	DebugStream.println(" (number of lines: " + lines.length + ")");
697	if (lines.length >= linenumber)
698	{
699	line = lines[linenumber - 1];
700	}
701	else
702	{ // error is past the last line
703	line = "Error is past the last line (" + lines.length + "): " + lines[lines.length - 1];
704	}
705	}
706	else
707	{
708	DebugStream.print("\n");
709	}
710	lines = null;
711
712	DebugStream.println("The parsing error occurred on this line:\n*********START\n" + line + "\n*********END");
713	DebugStream.println("SAXParseException message: " + e.getMessage() + "\n");
714
715	// Uncomment if you want to print out the entire contents of the XML doc:
716	//DebugStream.println("\n\nThis was the XML:\n*********START\n"
717	// + xmlContents + "\n************END\n");
718	}
719	else
720	{ // no particular line number, print out all the xml so debugger can inspect it
721	DebugStream.println("Encountered a SAX exception when parsing the following:\n*******START\n" + xmlContents + "\n**********END\n");
722	}
723	// Exit to let the user view the erroneous line/xml before it goes past the screen buffer?
724	DebugStream.println("\nDebug mode: Exiting the program as there was trouble parsing the XML...");
725	System.exit(-1);
726	}
727	else
728	{ // not running in debug mode
729	System.out.println("***Turn debugging on (run GLI with -debug) to view the XML contents/line that could not be parsed.");
730	}
731	}
732
733	static public StringBuffer readXMLStream(InputStream input_stream)
734	{
735	StringBuffer xml = new StringBuffer("");
736	try {
737	InputStreamReader isr = new InputStreamReader(input_stream, "UTF-8");
738	xml = XMLTools.readXMLStream(new InputStreamReader(input_stream, "UTF-8"));
739	} catch (UnsupportedEncodingException error) {
740	System.err.println("Failed when trying to parse XML stream");
741	error.printStackTrace();
742	}
743
744	return xml;
745	}
746
747	static public StringBuffer readXMLStream(String s) {
748	return XMLTools.readXMLStream(new StringReader(s));
749	}
750
751
752	static public StringBuffer readXMLStream(Reader reader)
753	{
754	StringBuffer xml = new StringBuffer("");
755
756	try
757	{
758	BufferedReader buffered_in = new BufferedReader(reader);
759
760	String line = "";
761	boolean xml_content = false;
762	while ((line = buffered_in.readLine()) != null)
763	{
764	if (xml_content)
765	{
766	xml.append(line);
767	xml.append("\n");
768	}
769	else if (line.trim().startsWith("<?xml"))
770	{
771	xml_content = true;
772	xml.append(line);
773	xml.append("\n");
774	}
775	else
776	{
777	System.err.println(line);
778	}
779	}
780	buffered_in = null;
781	}
782	catch (Exception error)
783	{
784	System.err.println("Failed when trying to parse XML stream");
785	error.printStackTrace();
786	}
787
788	return xml;
789	}
790
791	/**
792	* Removes characters that are invalid in XML (see
793	* http://www.w3.org/TR/2000/REC-xml-20001006#charsets)
794	*/
795	static public String removeInvalidCharacters(String text)
796	{
797	char[] safe_characters = new char[text.length()];
798	int j = 0;
799
800	char[] raw_characters = new char[text.length()];
801	text.getChars(0, text.length(), raw_characters, 0);
802	for (int i = 0; i < raw_characters.length; i++)
803	{
804	char character = raw_characters[i];
805	if ((character >= 0x20 && character <= 0xD7FF) \|\| character == 0x09 \|\| character == 0x0A \|\| character == 0x0D \|\| (character >= 0xE000 && character <= 0xFFFD) \|\| (character >= 0x10000 && character <= 0x10FFFF))
806	{
807	safe_characters[j] = character;
808	j++;
809	}
810	}
811
812	return new String(safe_characters, 0, j);
813	}
814
815	static public void setElementTextValue(Element element, String text)
816	{
817	// Remove all text node children
818	NodeList children_nodelist = element.getChildNodes();
819	for (int i = children_nodelist.getLength() - 1; i >= 0; i--)
820	{
821	Node child_node = children_nodelist.item(i);
822	if (child_node.getNodeType() == Node.TEXT_NODE)
823	{
824	element.removeChild(child_node);
825	}
826	}
827
828	// Add a new text node
829	if (text != null)
830	{
831	element.appendChild(element.getOwnerDocument().createTextNode(text));
832	}
833	}
834
835	/**
836	* Set the #text node value of some element.
837	*
838	* @param element
839	* the Element whose value we wish to set
840	* @param value
841	* the new value for the element as a String Soon to be
842	* deprecated!
843	*/
844	static final public void setValue(Element element, String value)
845	{
846	// Remove any existing child node(s)
847	clear(element);
848	// Add new text node.
849	if (value != null)
850	{
851	element.appendChild(element.getOwnerDocument().createTextNode(value));
852	}
853	}
854
855	static public void indentXML(Element elem, int depth)
856	{
857	Document doc = elem.getOwnerDocument();
858
859	String startIndentString = "\n";
860	for (int i = 0; i < depth; i++)
861	{
862	startIndentString += "\t";
863	}
864	Node startTextNode = doc.createTextNode(startIndentString);
865
866	String endIndentString = "\n";
867	for (int i = 0; i < depth - 1; i++)
868	{
869	endIndentString += "\t";
870	}
871	Node endTextNode = doc.createTextNode(endIndentString);
872
873	boolean found = false;
874	Node child = elem.getFirstChild();
875	while (child != null)
876	{
877	// first clear all empty text nodes (those containing space characters like \n,\r,\t and such)
878	if(child.getNodeType() == Node.TEXT_NODE && child.getNodeValue().matches("^\\s*$"))
879	{
880	Node spaceTextNode = child;
881	child = child.getNextSibling();
882	elem.removeChild(spaceTextNode);
883
884	if(child == null) break;
885	}
886
887	// now process normal element nodes as intended
888	if (child.getNodeType() == Node.ELEMENT_NODE)
889	{
890	found = true;
891	break;
892	}
893	child = child.getNextSibling();
894	}
895
896	if (found)
897	{
898	elem.appendChild(endTextNode);
899	}
900
901	child = elem.getFirstChild();
902	while (child != null)
903	{
904	// Again, need to first clear all empty text nodes (those containing space characters like \n,\r,\t and such)
905	// because the first while loop above would break out when it found an element node and wouldn't have got rid
906	// of all the empty text nodes yet.
907	// This time, beware not to delete the special end and start empty textnodes just added, since
908	// they've been created and inserted specifically.
909	if(child != endTextNode && child != startTextNode
910	&& child.getNodeType() == Node.TEXT_NODE && child.getNodeValue().matches("^\\s*$"))
911	{
912	Node spaceTextNode = child;
913	child = child.getNextSibling();
914	elem.removeChild(spaceTextNode);
915
916	if(child == null) break;
917	}
918
919	// go back to processing normal element nodes as intended
920	if (child.getNodeType() == Node.ELEMENT_NODE)
921	{
922	elem.insertBefore(startTextNode.cloneNode(false), child);
923	indentXML((Element) child, depth + 1);
924	}
925	child = child.getNextSibling();
926	}
927	}
928
929	/**
930	* Write an XML document to a given file with the text node of the specified
931	* element unescaped
932	*/
933	static public void writeXMLFile(File xml_file, Document document, String[] nonEscapingTagNames)
934	{
935	indentXML(document.getDocumentElement(), 1);
936	try
937	{
938	// OutputFormat() and XMLSerlalizer() have been deprecated
939	// StackOverflow suggested alternative is to use LSSerializer class from the package org.w3c.dom.ls
940	// Some example code doing this at:
941	// https://stackoverflow.com/questions/55729019/xmlserializer-outputformat-deprecated
942
943	OutputStream os = new FileOutputStream(xml_file);
944	// Create an output format for our document.
945	OutputFormat f = new OutputFormat(document);
946	f.setEncoding("UTF-8");
947	f.setIndenting(true);
948	f.setLineWidth(0); // Why isn't this working!
949	f.setPreserveSpace(true);
950	if (nonEscapingTagNames != null)
951	{
952	f.setNonEscapingElements(nonEscapingTagNames);
953	}
954	// Create the necessary writer stream for serialization.
955	OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8");
956	Writer w = new BufferedWriter(osw);
957	// Generate a new serializer from the above.
958	XMLSerializer s = new XMLSerializer(w, f);
959	s.asDOMSerializer();
960	// Finally serialize the document to file.
961	s.serialize(document);
962	// And close.
963	os.close();
964
965	// Useful for debugging unescaped chars that may otherwise break stuff
966	// w = new StringWriter();
967	// // Generate a new serializer from the above.
968	// s = new XMLSerializer(w, f);
969	// s.asDOMSerializer();
970	// // Finally serialize the document to file.
971	// s.serialize(document);
972
973	// System.err.println("@@@@ XMLTools: wrote out to doc:\n" + w.toString());
974
975	}
976	catch (Exception exception)
977	{
978	DebugStream.printStackTrace(exception);
979	}
980	}
981
982	/** Write an XML document to a given file */
983	static public void writeXMLFile(File xml_file, Document document)
984	{
985	writeXMLFile(xml_file, document, null);
986	}
987
988	public static void printXMLNode(Node e)
989	{
990	printXMLNode(e, 0);
991	}
992
993	public static void printXMLNode(Node e, int depth)
994	{ //recursive method call using DOM API...
995
996	for (int i = 0; i < depth; i++)
997	System.out.print(' ');
998
999	if (e.getNodeType() == Node.TEXT_NODE)
1000	{
1001	//System.out.println("text") ;
1002	if (e.getNodeValue() != "")
1003	{
1004	System.out.println(e.getNodeValue());
1005	}
1006	return;
1007	}
1008
1009	System.out.print('<');
1010	System.out.print(e.getNodeName());
1011	NamedNodeMap attrs = e.getAttributes();
1012	if (attrs != null)
1013	{
1014	for (int i = 0; i < attrs.getLength(); i++)
1015	{
1016	Node attr = attrs.item(i);
1017	System.out.print(' ');
1018	System.out.print(attr.getNodeName());
1019	System.out.print("=\"");
1020	System.out.print(attr.getNodeValue());
1021	System.out.print('"');
1022	}
1023	}
1024	NodeList children = e.getChildNodes();
1025
1026	if (children == null \|\| children.getLength() == 0)
1027	System.out.println("/>");
1028	else
1029	{
1030
1031	System.out.println('>');
1032
1033	int len = children.getLength();
1034	for (int i = 0; i < len; i++)
1035	{
1036	printXMLNode(children.item(i), depth + 1);
1037	}
1038
1039	for (int i = 0; i < depth; i++)
1040	System.out.print(' ');
1041
1042	System.out.println("</" + e.getNodeName() + ">");
1043	}
1044
1045	}
1046
1047	public static String xmlNodeToString(Node e)
1048	{
1049	StringBuffer sb = new StringBuffer("");
1050	xmlNodeToString(sb, e, true, "\t", 2);
1051	return sb.toString();
1052	}
1053
1054	public static void xmlNodeToString(StringBuffer sb, Node e, boolean indent, String indentString, int depth)
1055	{
1056
1057	if (e.getNodeType() == Node.CDATA_SECTION_NODE)
1058	{
1059	if (e.getNodeValue() != "")
1060	{
1061	String text = e.getNodeValue();
1062	sb.append("<![CDATA[");
1063	sb.append(text);
1064	sb.append("]]>");
1065	}
1066	return;
1067	}
1068
1069	if (e.getNodeType() == Node.TEXT_NODE)
1070	{
1071	if (e.getNodeValue() != "")
1072	{
1073	String text = e.getNodeValue();
1074	text = text.replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">").replaceAll("[\\n\\r\\t\\s]*$", "");
1075	for (Character c : text.toCharArray())
1076	{
1077	if (c.equals('\n'))
1078	{
1079	text = text.replaceAll("^[\\n\\r\\t\\s]*", "");
1080	break;
1081	}
1082
1083	if (!Character.isWhitespace(c))
1084	{
1085	break;
1086	}
1087	}
1088	sb.append(text);
1089	}
1090	return;
1091	}
1092
1093	if (e.getNodeType() == Node.COMMENT_NODE)
1094	{
1095	if (e.getNodeValue() != "")
1096	{
1097	sb.append("<!--\n" + e.getNodeValue().trim() + "\n-->\n");
1098	}
1099	return;
1100	}
1101
1102	if (indent)
1103	{
1104	for (int i = 0; i < depth; i++)
1105	{
1106	sb.append(indentString);
1107	}
1108	}
1109
1110	sb.append('<');
1111	sb.append(e.getNodeName());
1112	NamedNodeMap attrs = e.getAttributes();
1113	if (attrs != null)
1114	{
1115	for (int i = 0; i < attrs.getLength(); i++)
1116	{
1117	Node attr = attrs.item(i);
1118	sb.append(' ');
1119	sb.append(attr.getNodeName());
1120	sb.append("=\"");
1121	sb.append(attr.getNodeValue().replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">"));
1122	sb.append('"');
1123	}
1124	}
1125	NodeList children = e.getChildNodes();
1126
1127	boolean hasElements = false;
1128	boolean indentSwapped = false;
1129	for (int i = 0; i < children.getLength(); i++)
1130	{
1131	if (children.item(i).getNodeType() == Node.ELEMENT_NODE)
1132	{
1133	hasElements = true;
1134	}
1135	if ((children.item(i).getNodeType() == Node.TEXT_NODE \|\| children.item(i).getNodeType() == Node.CDATA_SECTION_NODE) && indent)
1136	{
1137	if (children.item(i).getNodeValue().trim().length() > 0)
1138	{
1139	indentSwapped = true;
1140	indent = false;
1141	}
1142	}
1143	}
1144
1145	if (children == null \|\| children.getLength() == 0)
1146	{
1147	sb.append("/>");
1148
1149	if (indent)
1150	{
1151	sb.append("\n");
1152	}
1153	}
1154	else
1155	{
1156	sb.append(">");
1157	if (hasElements && indent)
1158	{
1159	sb.append("\n");
1160	}
1161
1162	int len = children.getLength();
1163	for (int i = 0; i < len; i++)
1164	{
1165	xmlNodeToString(sb, children.item(i), indent, indentString, depth + 1);
1166	}
1167
1168	if (indent)
1169	{
1170	for (int i = 0; i < depth; i++)
1171	{
1172	sb.append(indentString);
1173	}
1174	}
1175
1176	sb.append("</" + e.getNodeName() + ">");
1177
1178	if ((hasElements && indent) \|\| indentSwapped)
1179	{
1180	sb.append("\n");
1181	}
1182	}
1183	}
1184
1185	public static String xmlNodeToStringWithoutIndenting(Node e)
1186	{
1187	StringBuffer sb = new StringBuffer("");
1188	xmlNodeToStringWithoutNewline(sb, e, -1);
1189	return sb.toString();
1190	}
1191
1192	public static String xmlNodeToStringWithoutNewline(Node e)
1193	{
1194	StringBuffer sb = new StringBuffer("");
1195	xmlNodeToStringWithoutNewline(sb, e, 0);
1196	return sb.toString();
1197	}
1198
1199	private static void xmlNodeToStringWithoutNewline(StringBuffer sb, Node e, int depth)
1200	{
1201
1202	for (int i = 0; i < depth; i++)
1203	{
1204	sb.append(' ');
1205	}
1206
1207	if (e.getNodeType() == Node.TEXT_NODE)
1208	{
1209	if (e.getNodeValue() != "")
1210	{
1211	sb.append(e.getNodeValue().replaceAll("&", "&").replaceAll("<", "<").replace(">", ">"));
1212	}
1213	return;
1214	}
1215
1216	if (e.getNodeType() == Node.COMMENT_NODE)
1217	{
1218	if (e.getNodeValue() != "")
1219	{
1220	sb.append("<!--" + e.getNodeValue() + "-->");
1221	}
1222	return;
1223	}
1224
1225	sb.append('<');
1226	sb.append(e.getNodeName());
1227	NamedNodeMap attrs = e.getAttributes();
1228	if (attrs != null)
1229	{
1230	for (int i = 0; i < attrs.getLength(); i++)
1231	{
1232	Node attr = attrs.item(i);
1233	sb.append(' ');
1234	sb.append(attr.getNodeName());
1235	sb.append("=\"");
1236	sb.append(attr.getNodeValue());
1237	sb.append('"');
1238	}
1239	}
1240	NodeList children = e.getChildNodes();
1241
1242	if (children == null \|\| children.getLength() == 0)
1243	sb.append("/>");
1244	else
1245	{
1246
1247	sb.append(">");
1248
1249	int len = children.getLength();
1250	for (int i = 0; i < len; i++)
1251	{
1252	if (depth >= 0)
1253	{
1254	xmlNodeToStringWithoutNewline(sb, children.item(i), depth + 1);
1255	}
1256	else
1257	{
1258	xmlNodeToStringWithoutNewline(sb, children.item(i), depth);
1259	}
1260	}
1261
1262	for (int i = 0; i < depth; i++)
1263	sb.append(' ');
1264
1265	sb.append("</" + e.getNodeName() + ">");
1266	}
1267	}
1268
1269
1270
1271	// This method will convert an Element to a String too, like xmlNodeToString() above.
1272	// But for a document root element (doc.getDocumentElement()), this method will additionally
1273	// return its processing instruction line at the start (<?xml ... ?>).
1274	// This method copied into GLI from src/java/org/greenstone/gsdl3/util/GSXML.java
1275	public static String elementToString(Element e, boolean indent)
1276	{
1277	String str = "";
1278	try
1279	{
1280	TransformerFactory tf = TransformerFactory.newInstance();
1281	Transformer trans = tf.newTransformer();
1282	StringWriter sw = new StringWriter();
1283	if (indent)
1284	{
1285	trans.setOutputProperty(OutputKeys.INDENT, "yes");
1286	}
1287	else
1288	{
1289	trans.setOutputProperty(OutputKeys.INDENT, "no");
1290	}
1291	trans.transform(new DOMSource(e), new StreamResult(sw));
1292	str = sw.toString();
1293	}
1294	catch (Exception ex)
1295	{
1296	str += "Exception: couldn't write " + e + " to log";
1297	}
1298	finally
1299	{
1300	return str;
1301	}
1302	}
1303	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: