source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16690

Last change on this file since 16690 was 16690, checked in by max, 16 years ago

Fix the missing space and double quotes in the Doctype output.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.Document;
23import org.w3c.dom.DocumentType;
24import org.w3c.dom.Element;
25import org.w3c.dom.Node;
26import org.w3c.dom.NodeList;
27import org.w3c.dom.NamedNodeMap;
28import org.xml.sax.InputSource;
29import org.xml.sax.EntityResolver;
30import org.apache.xerces.parsers.DOMParser;
31import org.apache.xerces.dom.DocumentImpl; // for new Documents
32
33// other java classes
34import java.io.Reader;
35import java.io.InputStreamReader;
36import java.io.StringReader;
37import java.io.File;
38import java.io.FileInputStream;
39import java.io.FileReader;
40
41import org.apache.log4j.*;
42
43/** XMLConverter - utility class for greenstone
44 *
45 * parses XML Strings into Documents, converts Nodes to Strings
46 * different parsers have different behaviour - can experiment in here
47 * now we only use xerces
48 *
49 * @author <a href="mailto:[email protected]">Katherine Don</a>
50 * @version $Revision: 16690 $
51 *
52 */
53public class XMLConverter {
54
55 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57 /** xerces parser */
58 protected DOMParser parser = null;
59
60 private boolean outputEscaping = true;
61
62
63 /** the no-args constructor */
64 public XMLConverter() {
65 try {
66 this.parser = new DOMParser();
67 this.parser.setFeature("http://xml.org/sax/features/validation", false);
68 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70 // a performance test showed that having this on lead to increased
71 // memory use for small-medium docs, and not much gain for large
72 // docs.
73 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75 } catch (Exception e) {
76 logger.error(e.getMessage());
77 }
78 }
79
80 /** sets the entity resolver. pass in null to unset it */
81 public void setEntityResolver(EntityResolver er) {
82 this.parser.setEntityResolver(er);
83 }
84
85
86 public Element nodeToElement(Node node)
87 {
88 short nodeType = node.getNodeType();
89
90 if (nodeType == Node.DOCUMENT_NODE) {
91 Document docNode = (Document)node;
92 return docNode.getDocumentElement() ;
93 }
94 else if (nodeType == Node.ELEMENT_NODE) {
95 return (Element)node;
96 }
97 else {
98 System.err.println("Expecting Document or Element node type but got " + node.getNodeName());
99 System.err.println("Returning null");
100 return null;
101 }
102 }
103
104 /** returns a DOM Document */
105 public Document getDOM(String in) {
106
107 try {
108 Reader reader = new StringReader(in);
109 InputSource xml_source = new InputSource(reader);
110
111 this.parser.parse(xml_source);
112 Document doc = this.parser.getDocument();
113
114 return doc;
115
116 } catch (Exception e) {
117 logger.error(e.getMessage());
118 }
119 return null;
120 }
121
122 /** returns a DOM Document */
123 public Document getDOM(File in) {
124 try {
125 FileReader reader = new FileReader(in);
126 InputSource xml_source = new InputSource(reader);
127 this.parser.parse(xml_source);
128 Document doc = this.parser.getDocument();
129 return doc;
130
131 } catch (Exception e) {
132 logger.error(e.getMessage(), e);
133
134 }
135 return null;
136 }
137
138 /** returns a DOM document */
139 public Document getDOM(File in, String encoding) {
140 try {
141
142 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
143 InputSource xml_source = new InputSource(isr);
144
145 this.parser.parse(xml_source);
146 Document doc = this.parser.getDocument();
147
148 return doc;
149
150 } catch (Exception e) {
151 logger.error(e.getMessage());
152 }
153 return null;
154 }
155
156
157 /** creates a new empty DOM Document */
158 public Document newDOM() {
159 Document doc = new DocumentImpl();
160 return doc;
161 }
162
163 /** returns the Node as a String */
164 public String getString(Node xmlNode)
165 {
166 outputEscaping = true;
167 StringBuffer xmlRepresentation = new StringBuffer();
168 getString(xmlNode, xmlRepresentation, 0, false);
169 return xmlRepresentation.toString();
170 }
171
172 /** returns the node as a nicely formatted String - this introduces extra
173 * text nodes if the String is read back in as a DOM, so should only be
174 * used for printing */
175 public String getPrettyString(Node xmlNode) {
176
177 outputEscaping = true;
178 StringBuffer xmlRepresentation = new StringBuffer();
179 getString(xmlNode, xmlRepresentation, 0, true);
180 return xmlRepresentation.toString();
181 }
182
183 private void getString(Node xmlNode, StringBuffer xmlRepresentation,
184 int depth, boolean pretty)
185 {
186
187 if (xmlNode == null) {
188 xmlRepresentation.append("<null>");
189 return;
190 }
191
192 short nodeType = xmlNode.getNodeType();
193 String nodeName = xmlNode.getNodeName();
194
195 if (nodeType == Node.DOCUMENT_NODE) {
196 Document xmlDocNode = (Document)xmlNode;
197
198 if (xmlDocNode.getDoctype() == null) {
199 System.err.println("Doctype is null.");
200 }
201 else {
202
203 DocumentType dt = xmlDocNode.getDoctype();
204
205 String name = dt.getName();
206 String pid = dt.getPublicId();
207 String sid = dt.getSystemId();
208
209 String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
210
211 xmlRepresentation.append(doctype_str);
212 }
213 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
214 return;
215 }
216 // Handle Element nodes
217 if (nodeType == Node.ELEMENT_NODE) {
218 if (pretty) {
219 xmlRepresentation.append("\n");
220 for (int i = 0; i < depth; i++) {
221 xmlRepresentation.append(" ");
222 }
223 }
224
225 // Write opening tag
226 xmlRepresentation.append("<");
227 xmlRepresentation.append(nodeName);
228
229 // Write the node attributes
230 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
231 for (int i = 0; i < nodeAttributes.getLength(); i++) {
232 Node attribute = nodeAttributes.item(i);
233 xmlRepresentation.append(" ");
234 xmlRepresentation.append(attribute.getNodeName());
235 xmlRepresentation.append("=\"");
236 xmlRepresentation.append(attribute.getNodeValue());
237 xmlRepresentation.append("\"");
238 }
239
240 // If the node has no children, close the opening tag and return
241 if (xmlNode.hasChildNodes() == false) {
242 // This produces somewhat ugly output, but it is necessary to compensate
243 // for display bugs in Netscape. Firstly, the space is needed before the
244 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
245 // example). Also, a newline character would be expected after the tag,
246 // but this causes problems with the display of links (the link text
247 // will contain a newline character, which is displayed badly).
248 xmlRepresentation.append(" />");
249 return;
250 }
251
252 // Close the opening tag
253 xmlRepresentation.append(">");
254
255 // Apply recursively to the children of this node
256 // hack for nodes next to text nodes - dont make them pretty
257 // this is needed for text inside a <pre> element - any new lines
258 // or spaces around the span elements show up in the text
259 NodeList children = xmlNode.getChildNodes();
260 boolean do_pretty = pretty;
261 for (int i = 0; i < children.getLength(); i++) {
262 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
263 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
264 }
265 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
266 }
267
268 // Write closing tag
269 if (pretty) {
270 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
271 for (int i = 0; i < depth; i++)
272 xmlRepresentation.append(" ");
273 }
274 }
275 xmlRepresentation.append("</");
276 xmlRepresentation.append(nodeName);
277 xmlRepresentation.append(">");
278 if (pretty) {
279 xmlRepresentation.append("\n");
280 }
281 }
282
283 // Handle Text nodes
284 else if (nodeType == Node.TEXT_NODE) {
285 String text = xmlNode.getNodeValue();
286
287 // Perform output escaping, if required
288 if (outputEscaping) {
289 text = text.replaceAll("&", "&amp;"); // Must be done first!!
290 text = text.replaceAll("<", "&lt;");
291 text = text.replaceAll(">", "&gt;");
292 text = text.replaceAll("\"", "&quot;");
293 text = text.replaceAll("\'", "&apos;");
294 }
295
296 // Remove any control-C characters
297 text = text.replaceAll("" + (char) 3, "");
298 xmlRepresentation.append(text);
299 }
300
301 // Handle Processing Instruction nodes
302 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
303 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
304 outputEscaping = false;
305 }
306 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
307 outputEscaping = true;
308 }
309 else {
310 logger.warn("Unhandled processing instruction " + nodeName);
311 }
312 }
313
314 else if (nodeType == Node.COMMENT_NODE) {
315 String text = xmlNode.getNodeValue();
316 xmlRepresentation.append("<!-- ");
317 xmlRepresentation.append(text);
318 xmlRepresentation.append(" -->");
319 }
320
321
322
323 // A type of node that is not handled yet
324 else {
325 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
326 }
327
328 return;
329 }
330
331 protected static String getNodeTypeString(short node_type) {
332
333 String type = "";
334 switch(node_type) {
335 case Node.ATTRIBUTE_NODE:
336 type="ATTRIBUTE_NODE";
337 break;
338 case Node.CDATA_SECTION_NODE:
339 type="CDATA_SECTION_NODE";
340 break;
341 case Node.COMMENT_NODE:
342 type="COMMENT_NODE";
343 break;
344 case Node.DOCUMENT_FRAGMENT_NODE:
345 type="DOCUMENT_FRAGMENT_NODE";
346 break;
347 case Node.DOCUMENT_NODE:
348 type="DOCUMENT_NODE";
349 break;
350 case Node.DOCUMENT_TYPE_NODE:
351 type="DOCUMENT_TYPE_NODE";
352 break;
353 case Node.ELEMENT_NODE:
354 type="ELEMENT_NODE";
355 break;
356 case Node.ENTITY_NODE:
357 type="ENTITY_NODE";
358 break;
359 case Node.ENTITY_REFERENCE_NODE:
360 type="ENTITY_REFERENCE_NODE";
361 break;
362 case Node.NOTATION_NODE:
363 type="NOTATION_NODE";
364 break;
365 case Node.PROCESSING_INSTRUCTION_NODE:
366 type="PROCESSING_INSTRUCTION_NODE";
367 break;
368 case Node.TEXT_NODE:
369 type="TEXT_NODE";
370 break;
371 default:
372 type="UNKNOWN";
373 }
374
375 return type;
376 }
377}
Note: See TracBrowser for help on using the repository browser.