source: trunk/gsdl3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 13210

Last change on this file since 13210 was 13210, checked in by kjdon, 17 years ago

Category is deprecated, should be using Logger instead. turned off external DTD loading in the parser

  • Property svn:keywords set to Author Date Id Revision
File size: 9.9 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.Document;
23import org.w3c.dom.Node;
24import org.w3c.dom.NodeList;
25import org.w3c.dom.NamedNodeMap;
26import org.xml.sax.InputSource;
27import org.xml.sax.EntityResolver;
28import org.apache.xerces.parsers.DOMParser;
29import org.apache.xerces.dom.DocumentImpl; // for new Documents
30
31// other java classes
32import java.io.Reader;
33import java.io.InputStreamReader;
34import java.io.StringReader;
35import java.io.File;
36import java.io.FileInputStream;
37import java.io.FileReader;
38
39import org.apache.log4j.*;
40
41/** XMLConverter - utility class for greenstone
42 *
43 * parses XML Strings into Documents, converts Nodes to Strings
44 * different parsers have different behaviour - can experiment in here
45 * now we only use xerces
46 *
47 * @author <a href="mailto:[email protected]">Katherine Don</a>
48 * @version $Revision: 13210 $
49 *
50 */
51public class XMLConverter {
52
53 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
54
55 /** xerces parser */
56 protected DOMParser parser = null;
57
58 private boolean outputEscaping = true;
59
60
61 /** the no-args constructor */
62 public XMLConverter() {
63 try {
64 this.parser = new DOMParser();
65 this.parser.setFeature("http://xml.org/sax/features/validation", false);
66 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
67 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
68 // a performance test showed that having this on lead to increased
69 // memory use for small-medium docs, and not much gain for large
70 // docs.
71 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
72 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
73 } catch (Exception e) {
74 logger.error(e.getMessage());
75 }
76 }
77
78 /** sets the entity resolver. pass in null to unset it */
79 public void setEntityResolver(EntityResolver er) {
80 this.parser.setEntityResolver(er);
81 }
82
83 /** returns a DOM Document */
84 public Document getDOM(String in) {
85
86 try {
87 Reader reader = new StringReader(in);
88 InputSource xml_source = new InputSource(reader);
89
90 this.parser.parse(xml_source);
91 Document doc = this.parser.getDocument();
92
93 return doc;
94
95 } catch (Exception e) {
96 logger.error(e.getMessage());
97 }
98 return null;
99 }
100
101 /** returns a DOM Document */
102 public Document getDOM(File in) {
103 try {
104 FileReader reader = new FileReader(in);
105 InputSource xml_source = new InputSource(reader);
106 this.parser.parse(xml_source);
107 Document doc = this.parser.getDocument();
108 return doc;
109
110 } catch (Exception e) {
111 logger.error(e.getMessage(), e);
112
113 }
114 return null;
115 }
116
117 /** returns a DOM document */
118 public Document getDOM(File in, String encoding) {
119 try {
120
121 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
122 InputSource xml_source = new InputSource(isr);
123
124 this.parser.parse(xml_source);
125 Document doc = this.parser.getDocument();
126
127 return doc;
128
129 } catch (Exception e) {
130 logger.error(e.getMessage());
131 }
132 return null;
133 }
134
135
136 /** creates a new empty DOM Document */
137 public Document newDOM() {
138 Document doc = new DocumentImpl();
139 return doc;
140 }
141
142 /** returns the Node as a String */
143 public String getString(Node xmlNode)
144 {
145 outputEscaping = true;
146 StringBuffer xmlRepresentation = new StringBuffer();
147 getString(xmlNode, xmlRepresentation, 0, false);
148 return xmlRepresentation.toString();
149 }
150
151 /** returns the node as a nicely formatted String - this introduces extra
152 * text nodes if the String is read back in as a DOM, so should only be
153 * used for printing */
154 public String getPrettyString(Node xmlNode) {
155
156 outputEscaping = true;
157 StringBuffer xmlRepresentation = new StringBuffer();
158 getString(xmlNode, xmlRepresentation, 0, true);
159 return xmlRepresentation.toString();
160 }
161
162 private void getString(Node xmlNode, StringBuffer xmlRepresentation,
163 int depth, boolean pretty)
164 {
165
166 if (xmlNode == null) {
167 xmlRepresentation.append("<null>");
168 return;
169 }
170
171 short nodeType = xmlNode.getNodeType();
172 String nodeName = xmlNode.getNodeName();
173
174 if (nodeType == Node.DOCUMENT_NODE) {
175 getString(((Document)xmlNode).getDocumentElement(), xmlRepresentation, depth, pretty);
176 return;
177 }
178 // Handle Element nodes
179 if (nodeType == Node.ELEMENT_NODE) {
180 if (pretty) {
181 xmlRepresentation.append("\n");
182 for (int i = 0; i < depth; i++) {
183 xmlRepresentation.append(" ");
184 }
185 }
186
187 // Write opening tag
188 xmlRepresentation.append("<");
189 xmlRepresentation.append(nodeName);
190
191 // Write the node attributes
192 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
193 for (int i = 0; i < nodeAttributes.getLength(); i++) {
194 Node attribute = nodeAttributes.item(i);
195 xmlRepresentation.append(" ");
196 xmlRepresentation.append(attribute.getNodeName());
197 xmlRepresentation.append("=\"");
198 xmlRepresentation.append(attribute.getNodeValue());
199 xmlRepresentation.append("\"");
200 }
201
202 // If the node has no children, close the opening tag and return
203 if (xmlNode.hasChildNodes() == false) {
204 // This produces somewhat ugly output, but it is necessary to compensate
205 // for display bugs in Netscape. Firstly, the space is needed before the
206 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
207 // example). Also, a newline character would be expected after the tag,
208 // but this causes problems with the display of links (the link text
209 // will contain a newline character, which is displayed badly).
210 xmlRepresentation.append(" />");
211 return;
212 }
213
214 // Close the opening tag
215 xmlRepresentation.append(">");
216
217 // Apply recursively to the children of this node
218 // hack for nodes next to text nodes - dont make them pretty
219 // this is needed for text inside a <pre> element - any new lines
220 // or spaces around the span elements show up in the text
221 NodeList children = xmlNode.getChildNodes();
222 boolean do_pretty = pretty;
223 for (int i = 0; i < children.getLength(); i++) {
224 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
225 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
226 }
227 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
228 }
229
230 // Write closing tag
231 if (pretty) {
232 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
233 for (int i = 0; i < depth; i++)
234 xmlRepresentation.append(" ");
235 }
236 }
237 xmlRepresentation.append("</");
238 xmlRepresentation.append(nodeName);
239 xmlRepresentation.append(">");
240 if (pretty) {
241 xmlRepresentation.append("\n");
242 }
243 }
244
245 // Handle Text nodes
246 else if (nodeType == Node.TEXT_NODE) {
247 String text = xmlNode.getNodeValue();
248
249 // Perform output escaping, if required
250 if (outputEscaping) {
251 text = text.replaceAll("&", "&amp;"); // Must be done first!!
252 text = text.replaceAll("<", "&lt;");
253 text = text.replaceAll(">", "&gt;");
254 text = text.replaceAll("\"", "&quot;");
255 text = text.replaceAll("\'", "&apos;");
256 }
257
258 // Remove any control-C characters
259 text = text.replaceAll("" + (char) 3, "");
260 xmlRepresentation.append(text);
261 }
262
263 // Handle Processing Instruction nodes
264 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
265 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
266 outputEscaping = false;
267 }
268 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
269 outputEscaping = true;
270 }
271 else {
272 logger.warn("Unhandled processing instruction " + nodeName);
273 }
274 }
275
276 else if (nodeType == Node.COMMENT_NODE) {
277 String text = xmlNode.getNodeValue();
278 xmlRepresentation.append("<!-- ");
279 xmlRepresentation.append(text);
280 xmlRepresentation.append(" -->");
281 }
282
283
284
285 // A type of node that is not handled yet
286 else {
287 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
288 }
289
290 return;
291 }
292
293 protected static String getNodeTypeString(short node_type) {
294
295 String type = "";
296 switch(node_type) {
297 case Node.ATTRIBUTE_NODE:
298 type="ATTRIBUTE_NODE";
299 break;
300 case Node.CDATA_SECTION_NODE:
301 type="CDATA_SECTION_NODE";
302 break;
303 case Node.COMMENT_NODE:
304 type="COMMENT_NODE";
305 break;
306 case Node.DOCUMENT_FRAGMENT_NODE:
307 type="DOCUMENT_FRAGMENT_NODE";
308 break;
309 case Node.DOCUMENT_NODE:
310 type="DOCUMENT_NODE";
311 break;
312 case Node.DOCUMENT_TYPE_NODE:
313 type="DOCUMENT_TYPE_NODE";
314 break;
315 case Node.ELEMENT_NODE:
316 type="ELEMENT_NODE";
317 break;
318 case Node.ENTITY_NODE:
319 type="ENTITY_NODE";
320 break;
321 case Node.ENTITY_REFERENCE_NODE:
322 type="ENTITY_REFERENCE_NODE";
323 break;
324 case Node.NOTATION_NODE:
325 type="NOTATION_NODE";
326 break;
327 case Node.PROCESSING_INSTRUCTION_NODE:
328 type="PROCESSING_INSTRUCTION_NODE";
329 break;
330 case Node.TEXT_NODE:
331 type="TEXT_NODE";
332 break;
333 default:
334 type="UNKNOWN";
335 }
336
337 return type;
338 }
339}
Note: See TracBrowser for help on using the repository browser.