source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16780

Last change on this file since 16780 was 16780, checked in by ak19, 16 years ago

Made the new nodeToElement method in XMLConverter static, so that classes which need only this method don't have to instantiate an XMLConverter (DOMParser and all) merely to call it.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.Document;
23import org.w3c.dom.DocumentType;
24import org.w3c.dom.Element;
25import org.w3c.dom.Node;
26import org.w3c.dom.NodeList;
27import org.w3c.dom.NamedNodeMap;
28import org.xml.sax.InputSource;
29import org.xml.sax.EntityResolver;
30import org.apache.xerces.parsers.DOMParser;
31import org.apache.xerces.dom.DocumentImpl; // for new Documents
32
33// other java classes
34import java.io.Reader;
35import java.io.InputStreamReader;
36import java.io.StringReader;
37import java.io.File;
38import java.io.FileInputStream;
39import java.io.FileReader;
40
41import org.apache.log4j.*;
42
43/** XMLConverter - utility class for greenstone
44 *
45 * parses XML Strings into Documents, converts Nodes to Strings
46 * different parsers have different behaviour - can experiment in here
47 * now we only use xerces
48 *
49 * @author <a href="mailto:[email protected]">Katherine Don</a>
50 * @version $Revision: 16780 $
51 *
52 */
53public class XMLConverter {
54
55 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57 /** xerces parser */
58 protected DOMParser parser = null;
59
60 private boolean outputEscaping = true;
61
62
63 /** the no-args constructor */
64 public XMLConverter() {
65 try {
66 this.parser = new DOMParser();
67 this.parser.setFeature("http://xml.org/sax/features/validation", false);
68 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70 // a performance test showed that having this on lead to increased
71 // memory use for small-medium docs, and not much gain for large
72 // docs.
73 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75 } catch (Exception e) {
76 logger.error(e.getMessage());
77 }
78 }
79
80 /** sets the entity resolver. pass in null to unset it */
81 public void setEntityResolver(EntityResolver er) {
82 this.parser.setEntityResolver(er);
83 }
84
85 /** Given a Node representing an Element or Document, will return the
86 * Element/docroot Element. Returns null if the Node was not an element. */
87 public static Element nodeToElement(Node node)
88 {
89 short nodeType = node.getNodeType();
90
91 if (nodeType == Node.DOCUMENT_NODE) {
92 Document docNode = (Document)node;
93 return docNode.getDocumentElement() ;
94 }
95 else if (nodeType == Node.ELEMENT_NODE) {
96 return (Element)node;
97 }
98 else {
99 String message = "Expecting Document or Element node type but got "
100 + node.getNodeName() + "\nReturning null";
101 System.err.println(message);
102 logger.warn(message);
103 return null;
104 }
105 }
106
107 /** returns a DOM Document */
108 public Document getDOM(String in) {
109
110 try {
111 Reader reader = new StringReader(in);
112 InputSource xml_source = new InputSource(reader);
113
114 this.parser.parse(xml_source);
115 Document doc = this.parser.getDocument();
116
117 return doc;
118
119 } catch (Exception e) {
120 logger.error(e.getMessage());
121 }
122 return null;
123 }
124
125 /** returns a DOM Document */
126 public Document getDOM(File in) {
127 try {
128 FileReader reader = new FileReader(in);
129 InputSource xml_source = new InputSource(reader);
130 this.parser.parse(xml_source);
131 Document doc = this.parser.getDocument();
132 return doc;
133
134 } catch (Exception e) {
135 logger.error(e.getMessage(), e);
136
137 }
138 return null;
139 }
140
141 /** returns a DOM document */
142 public Document getDOM(File in, String encoding) {
143 try {
144
145 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
146 InputSource xml_source = new InputSource(isr);
147
148 this.parser.parse(xml_source);
149 Document doc = this.parser.getDocument();
150
151 return doc;
152
153 } catch (Exception e) {
154 logger.error(e.getMessage());
155 }
156 return null;
157 }
158
159
160 /** creates a new empty DOM Document */
161 public Document newDOM() {
162 Document doc = new DocumentImpl();
163 return doc;
164 }
165
166 /** returns the Node as a String */
167 public String getString(Node xmlNode)
168 {
169 outputEscaping = true;
170 StringBuffer xmlRepresentation = new StringBuffer();
171 getString(xmlNode, xmlRepresentation, 0, false);
172 return xmlRepresentation.toString();
173 }
174
175 /** returns the node as a nicely formatted String - this introduces extra
176 * text nodes if the String is read back in as a DOM, so should only be
177 * used for printing */
178 public String getPrettyString(Node xmlNode) {
179
180 outputEscaping = true;
181 StringBuffer xmlRepresentation = new StringBuffer();
182 getString(xmlNode, xmlRepresentation, 0, true);
183 return xmlRepresentation.toString();
184 }
185
186 private void getString(Node xmlNode, StringBuffer xmlRepresentation,
187 int depth, boolean pretty)
188 {
189
190 if (xmlNode == null) {
191 xmlRepresentation.append("<null>");
192 return;
193 }
194
195 short nodeType = xmlNode.getNodeType();
196 String nodeName = xmlNode.getNodeName();
197
198 if (nodeType == Node.DOCUMENT_NODE) {
199 Document xmlDocNode = (Document)xmlNode;
200
201 if (xmlDocNode.getDoctype() == null) {
202 System.err.println("Doctype is null.");
203 }
204 else {
205
206 DocumentType dt = xmlDocNode.getDoctype();
207
208 String name = dt.getName();
209 String pid = dt.getPublicId();
210 String sid = dt.getSystemId();
211
212 String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
213
214 xmlRepresentation.append(doctype_str);
215 }
216 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
217 return;
218 }
219 // Handle Element nodes
220 if (nodeType == Node.ELEMENT_NODE) {
221 if (pretty) {
222 xmlRepresentation.append("\n");
223 for (int i = 0; i < depth; i++) {
224 xmlRepresentation.append(" ");
225 }
226 }
227
228 // Write opening tag
229 xmlRepresentation.append("<");
230 xmlRepresentation.append(nodeName);
231
232 // Write the node attributes
233 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
234 for (int i = 0; i < nodeAttributes.getLength(); i++) {
235 Node attribute = nodeAttributes.item(i);
236 xmlRepresentation.append(" ");
237 xmlRepresentation.append(attribute.getNodeName());
238 xmlRepresentation.append("=\"");
239 xmlRepresentation.append(attribute.getNodeValue());
240 xmlRepresentation.append("\"");
241 }
242
243 // If the node has no children, close the opening tag and return
244 if (xmlNode.hasChildNodes() == false) {
245 // This produces somewhat ugly output, but it is necessary to compensate
246 // for display bugs in Netscape. Firstly, the space is needed before the
247 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
248 // example). Also, a newline character would be expected after the tag,
249 // but this causes problems with the display of links (the link text
250 // will contain a newline character, which is displayed badly).
251 xmlRepresentation.append(" />");
252 return;
253 }
254
255 // Close the opening tag
256 xmlRepresentation.append(">");
257
258 // Apply recursively to the children of this node
259 // hack for nodes next to text nodes - dont make them pretty
260 // this is needed for text inside a <pre> element - any new lines
261 // or spaces around the span elements show up in the text
262 NodeList children = xmlNode.getChildNodes();
263 boolean do_pretty = pretty;
264 for (int i = 0; i < children.getLength(); i++) {
265 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
266 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
267 }
268 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
269 }
270
271 // Write closing tag
272 if (pretty) {
273 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
274 for (int i = 0; i < depth; i++)
275 xmlRepresentation.append(" ");
276 }
277 }
278 xmlRepresentation.append("</");
279 xmlRepresentation.append(nodeName);
280 xmlRepresentation.append(">");
281 if (pretty) {
282 xmlRepresentation.append("\n");
283 }
284 }
285
286 // Handle Text nodes
287 else if (nodeType == Node.TEXT_NODE) {
288 String text = xmlNode.getNodeValue();
289
290 // Perform output escaping, if required
291 if (outputEscaping) {
292 text = text.replaceAll("&", "&amp;"); // Must be done first!!
293 text = text.replaceAll("<", "&lt;");
294 text = text.replaceAll(">", "&gt;");
295 text = text.replaceAll("\"", "&quot;");
296 text = text.replaceAll("\'", "&apos;");
297 }
298
299 // Remove any control-C characters
300 text = text.replaceAll("" + (char) 3, "");
301 xmlRepresentation.append(text);
302 }
303
304 // Handle Processing Instruction nodes
305 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
306 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
307 outputEscaping = false;
308 }
309 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
310 outputEscaping = true;
311 }
312 else {
313 logger.warn("Unhandled processing instruction " + nodeName);
314 }
315 }
316
317 else if (nodeType == Node.COMMENT_NODE) {
318 String text = xmlNode.getNodeValue();
319 xmlRepresentation.append("<!-- ");
320 xmlRepresentation.append(text);
321 xmlRepresentation.append(" -->");
322 }
323
324
325
326 // A type of node that is not handled yet
327 else {
328 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
329 }
330
331 return;
332 }
333
334 protected static String getNodeTypeString(short node_type) {
335
336 String type = "";
337 switch(node_type) {
338 case Node.ATTRIBUTE_NODE:
339 type="ATTRIBUTE_NODE";
340 break;
341 case Node.CDATA_SECTION_NODE:
342 type="CDATA_SECTION_NODE";
343 break;
344 case Node.COMMENT_NODE:
345 type="COMMENT_NODE";
346 break;
347 case Node.DOCUMENT_FRAGMENT_NODE:
348 type="DOCUMENT_FRAGMENT_NODE";
349 break;
350 case Node.DOCUMENT_NODE:
351 type="DOCUMENT_NODE";
352 break;
353 case Node.DOCUMENT_TYPE_NODE:
354 type="DOCUMENT_TYPE_NODE";
355 break;
356 case Node.ELEMENT_NODE:
357 type="ELEMENT_NODE";
358 break;
359 case Node.ENTITY_NODE:
360 type="ENTITY_NODE";
361 break;
362 case Node.ENTITY_REFERENCE_NODE:
363 type="ENTITY_REFERENCE_NODE";
364 break;
365 case Node.NOTATION_NODE:
366 type="NOTATION_NODE";
367 break;
368 case Node.PROCESSING_INSTRUCTION_NODE:
369 type="PROCESSING_INSTRUCTION_NODE";
370 break;
371 case Node.TEXT_NODE:
372 type="TEXT_NODE";
373 break;
374 default:
375 type="UNKNOWN";
376 }
377
378 return type;
379 }
380}
Note: See TracBrowser for help on using the repository browser.