source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16374

Last change on this file since 16374 was 16374, checked in by davidb, 16 years ago

Change to Skin code (returning Node, not Element) so that XSLT encodes its DocType -- important information for IE to render the resultant HTML correctly. This also required Skin.java to be changed from using DomResult to StreamResult. The former is known to have a problem with losing its DocType info and, as it is then read-only, has no elegant way to put this info back.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.Document;
23import org.w3c.dom.DocumentType;
24import org.w3c.dom.Node;
25import org.w3c.dom.NodeList;
26import org.w3c.dom.NamedNodeMap;
27import org.xml.sax.InputSource;
28import org.xml.sax.EntityResolver;
29import org.apache.xerces.parsers.DOMParser;
30import org.apache.xerces.dom.DocumentImpl; // for new Documents
31
32// other java classes
33import java.io.Reader;
34import java.io.InputStreamReader;
35import java.io.StringReader;
36import java.io.File;
37import java.io.FileInputStream;
38import java.io.FileReader;
39
40import org.apache.log4j.*;
41
42/** XMLConverter - utility class for greenstone
43 *
44 * parses XML Strings into Documents, converts Nodes to Strings
45 * different parsers have different behaviour - can experiment in here
46 * now we only use xerces
47 *
48 * @author <a href="mailto:[email protected]">Katherine Don</a>
49 * @version $Revision: 16374 $
50 *
51 */
52public class XMLConverter {
53
54 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
55
56 /** xerces parser */
57 protected DOMParser parser = null;
58
59 private boolean outputEscaping = true;
60
61
62 /** the no-args constructor */
63 public XMLConverter() {
64 try {
65 this.parser = new DOMParser();
66 this.parser.setFeature("http://xml.org/sax/features/validation", false);
67 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
68 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
69 // a performance test showed that having this on lead to increased
70 // memory use for small-medium docs, and not much gain for large
71 // docs.
72 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
73 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
74 } catch (Exception e) {
75 logger.error(e.getMessage());
76 }
77 }
78
79 /** sets the entity resolver. pass in null to unset it */
80 public void setEntityResolver(EntityResolver er) {
81 this.parser.setEntityResolver(er);
82 }
83
84 /** returns a DOM Document */
85 public Document getDOM(String in) {
86
87 try {
88 Reader reader = new StringReader(in);
89 InputSource xml_source = new InputSource(reader);
90
91 this.parser.parse(xml_source);
92 Document doc = this.parser.getDocument();
93
94 return doc;
95
96 } catch (Exception e) {
97 logger.error(e.getMessage());
98 }
99 return null;
100 }
101
102 /** returns a DOM Document */
103 public Document getDOM(File in) {
104 try {
105 FileReader reader = new FileReader(in);
106 InputSource xml_source = new InputSource(reader);
107 this.parser.parse(xml_source);
108 Document doc = this.parser.getDocument();
109 return doc;
110
111 } catch (Exception e) {
112 logger.error(e.getMessage(), e);
113
114 }
115 return null;
116 }
117
118 /** returns a DOM document */
119 public Document getDOM(File in, String encoding) {
120 try {
121
122 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
123 InputSource xml_source = new InputSource(isr);
124
125 this.parser.parse(xml_source);
126 Document doc = this.parser.getDocument();
127
128 return doc;
129
130 } catch (Exception e) {
131 logger.error(e.getMessage());
132 }
133 return null;
134 }
135
136
137 /** creates a new empty DOM Document */
138 public Document newDOM() {
139 Document doc = new DocumentImpl();
140 return doc;
141 }
142
143 /** returns the Node as a String */
144 public String getString(Node xmlNode)
145 {
146 outputEscaping = true;
147 StringBuffer xmlRepresentation = new StringBuffer();
148 getString(xmlNode, xmlRepresentation, 0, false);
149 return xmlRepresentation.toString();
150 }
151
152 /** returns the node as a nicely formatted String - this introduces extra
153 * text nodes if the String is read back in as a DOM, so should only be
154 * used for printing */
155 public String getPrettyString(Node xmlNode) {
156
157 outputEscaping = true;
158 StringBuffer xmlRepresentation = new StringBuffer();
159 getString(xmlNode, xmlRepresentation, 0, true);
160 return xmlRepresentation.toString();
161 }
162
163 private void getString(Node xmlNode, StringBuffer xmlRepresentation,
164 int depth, boolean pretty)
165 {
166
167 if (xmlNode == null) {
168 xmlRepresentation.append("<null>");
169 return;
170 }
171
172 short nodeType = xmlNode.getNodeType();
173 String nodeName = xmlNode.getNodeName();
174
175 if (nodeType == Node.DOCUMENT_NODE) {
176 Document xmlDocNode = (Document)xmlNode;
177
178 if (xmlDocNode.getDoctype() == null) {
179 System.err.println("Doctype is null.");
180 }
181 else {
182
183 DocumentType dt = xmlDocNode.getDoctype();
184
185 String name = dt.getName();
186 String pid = dt.getPublicId();
187 String sid = dt.getSystemId();
188
189 String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\"" + sid + "\">\n";
190
191 xmlRepresentation.append(doctype_str);
192 }
193 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
194 return;
195 }
196 // Handle Element nodes
197 if (nodeType == Node.ELEMENT_NODE) {
198 if (pretty) {
199 xmlRepresentation.append("\n");
200 for (int i = 0; i < depth; i++) {
201 xmlRepresentation.append(" ");
202 }
203 }
204
205 // Write opening tag
206 xmlRepresentation.append("<");
207 xmlRepresentation.append(nodeName);
208
209 // Write the node attributes
210 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
211 for (int i = 0; i < nodeAttributes.getLength(); i++) {
212 Node attribute = nodeAttributes.item(i);
213 xmlRepresentation.append(" ");
214 xmlRepresentation.append(attribute.getNodeName());
215 xmlRepresentation.append("=\"");
216 xmlRepresentation.append(attribute.getNodeValue());
217 xmlRepresentation.append("\"");
218 }
219
220 // If the node has no children, close the opening tag and return
221 if (xmlNode.hasChildNodes() == false) {
222 // This produces somewhat ugly output, but it is necessary to compensate
223 // for display bugs in Netscape. Firstly, the space is needed before the
224 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
225 // example). Also, a newline character would be expected after the tag,
226 // but this causes problems with the display of links (the link text
227 // will contain a newline character, which is displayed badly).
228 xmlRepresentation.append(" />");
229 return;
230 }
231
232 // Close the opening tag
233 xmlRepresentation.append(">");
234
235 // Apply recursively to the children of this node
236 // hack for nodes next to text nodes - dont make them pretty
237 // this is needed for text inside a <pre> element - any new lines
238 // or spaces around the span elements show up in the text
239 NodeList children = xmlNode.getChildNodes();
240 boolean do_pretty = pretty;
241 for (int i = 0; i < children.getLength(); i++) {
242 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
243 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
244 }
245 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
246 }
247
248 // Write closing tag
249 if (pretty) {
250 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
251 for (int i = 0; i < depth; i++)
252 xmlRepresentation.append(" ");
253 }
254 }
255 xmlRepresentation.append("</");
256 xmlRepresentation.append(nodeName);
257 xmlRepresentation.append(">");
258 if (pretty) {
259 xmlRepresentation.append("\n");
260 }
261 }
262
263 // Handle Text nodes
264 else if (nodeType == Node.TEXT_NODE) {
265 String text = xmlNode.getNodeValue();
266
267 // Perform output escaping, if required
268 if (outputEscaping) {
269 text = text.replaceAll("&", "&amp;"); // Must be done first!!
270 text = text.replaceAll("<", "&lt;");
271 text = text.replaceAll(">", "&gt;");
272 text = text.replaceAll("\"", "&quot;");
273 text = text.replaceAll("\'", "&apos;");
274 }
275
276 // Remove any control-C characters
277 text = text.replaceAll("" + (char) 3, "");
278 xmlRepresentation.append(text);
279 }
280
281 // Handle Processing Instruction nodes
282 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
283 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
284 outputEscaping = false;
285 }
286 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
287 outputEscaping = true;
288 }
289 else {
290 logger.warn("Unhandled processing instruction " + nodeName);
291 }
292 }
293
294 else if (nodeType == Node.COMMENT_NODE) {
295 String text = xmlNode.getNodeValue();
296 xmlRepresentation.append("<!-- ");
297 xmlRepresentation.append(text);
298 xmlRepresentation.append(" -->");
299 }
300
301
302
303 // A type of node that is not handled yet
304 else {
305 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
306 }
307
308 return;
309 }
310
311 protected static String getNodeTypeString(short node_type) {
312
313 String type = "";
314 switch(node_type) {
315 case Node.ATTRIBUTE_NODE:
316 type="ATTRIBUTE_NODE";
317 break;
318 case Node.CDATA_SECTION_NODE:
319 type="CDATA_SECTION_NODE";
320 break;
321 case Node.COMMENT_NODE:
322 type="COMMENT_NODE";
323 break;
324 case Node.DOCUMENT_FRAGMENT_NODE:
325 type="DOCUMENT_FRAGMENT_NODE";
326 break;
327 case Node.DOCUMENT_NODE:
328 type="DOCUMENT_NODE";
329 break;
330 case Node.DOCUMENT_TYPE_NODE:
331 type="DOCUMENT_TYPE_NODE";
332 break;
333 case Node.ELEMENT_NODE:
334 type="ELEMENT_NODE";
335 break;
336 case Node.ENTITY_NODE:
337 type="ENTITY_NODE";
338 break;
339 case Node.ENTITY_REFERENCE_NODE:
340 type="ENTITY_REFERENCE_NODE";
341 break;
342 case Node.NOTATION_NODE:
343 type="NOTATION_NODE";
344 break;
345 case Node.PROCESSING_INSTRUCTION_NODE:
346 type="PROCESSING_INSTRUCTION_NODE";
347 break;
348 case Node.TEXT_NODE:
349 type="TEXT_NODE";
350 break;
351 default:
352 type="UNKNOWN";
353 }
354
355 return type;
356 }
357}
Note: See TracBrowser for help on using the repository browser.