source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16688

Last change on this file since 16688 was 16688, checked in by davidb, 16 years ago

Changed 'Element process(Element)' in ModuleInterface to 'Node process(Node)'. After some deliberation it was decided this is a more useful (generic) layer of the DOM to pass information around in. Helps with the DocType problem when producing XSL Transformed pages, for example. When this was an Element, it would lose track of its DocType. Supporting method provided in XMLConverter 'Element nodeToElement(Node)' which checks a node's type and casts to Element if appropriate, or if a Document, typecasts to that and then extracts the top-level Element. With this fundamental change in ModuleInterface, around 20 files needed to be updated (Actions, Services, etc) that build on top of 'process()' to reflect this change, and use nodeToElement where necessary.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1/*
2 * XMLConverter.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21// XML classes
22import org.w3c.dom.Document;
23import org.w3c.dom.DocumentType;
24import org.w3c.dom.Element;
25import org.w3c.dom.Node;
26import org.w3c.dom.NodeList;
27import org.w3c.dom.NamedNodeMap;
28import org.xml.sax.InputSource;
29import org.xml.sax.EntityResolver;
30import org.apache.xerces.parsers.DOMParser;
31import org.apache.xerces.dom.DocumentImpl; // for new Documents
32
33// other java classes
34import java.io.Reader;
35import java.io.InputStreamReader;
36import java.io.StringReader;
37import java.io.File;
38import java.io.FileInputStream;
39import java.io.FileReader;
40
41import org.apache.log4j.*;
42
43/** XMLConverter - utility class for greenstone
44 *
45 * parses XML Strings into Documents, converts Nodes to Strings
46 * different parsers have different behaviour - can experiment in here
47 * now we only use xerces
48 *
49 * @author <a href="mailto:[email protected]">Katherine Don</a>
50 * @version $Revision: 16688 $
51 *
52 */
53public class XMLConverter {
54
55 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57 /** xerces parser */
58 protected DOMParser parser = null;
59
60 private boolean outputEscaping = true;
61
62
63 /** the no-args constructor */
64 public XMLConverter() {
65 try {
66 this.parser = new DOMParser();
67 this.parser.setFeature("http://xml.org/sax/features/validation", false);
68 // don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69 this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70 // a performance test showed that having this on lead to increased
71 // memory use for small-medium docs, and not much gain for large
72 // docs.
73 // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74 this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75 } catch (Exception e) {
76 logger.error(e.getMessage());
77 }
78 }
79
80 /** sets the entity resolver. pass in null to unset it */
81 public void setEntityResolver(EntityResolver er) {
82 this.parser.setEntityResolver(er);
83 }
84
85
86 public Element nodeToElement(Node node)
87 {
88 short nodeType = node.getNodeType();
89
90 if (nodeType == Node.DOCUMENT_NODE) {
91 Document docNode = (Document)node;
92 return docNode.getDocumentElement() ;
93 }
94 else if (nodeType == Node.ELEMENT_NODE) {
95 return (Element)node;
96 }
97 else {
98 System.err.println("Expecting Document or Element node type but got " + node.getNodeName());
99 System.err.println("Returning null");
100 return null;
101 }
102 }
103
104 /** returns a DOM Document */
105 public Document getDOM(String in) {
106
107 try {
108 Reader reader = new StringReader(in);
109 InputSource xml_source = new InputSource(reader);
110
111 this.parser.parse(xml_source);
112 Document doc = this.parser.getDocument();
113
114 return doc;
115
116 } catch (Exception e) {
117 logger.error(e.getMessage());
118 }
119 return null;
120 }
121
122 /** returns a DOM Document */
123 public Document getDOM(File in) {
124 try {
125 FileReader reader = new FileReader(in);
126 InputSource xml_source = new InputSource(reader);
127 this.parser.parse(xml_source);
128 Document doc = this.parser.getDocument();
129 return doc;
130
131 } catch (Exception e) {
132 logger.error(e.getMessage(), e);
133
134 }
135 return null;
136 }
137
138 /** returns a DOM document */
139 public Document getDOM(File in, String encoding) {
140 try {
141
142 InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
143 InputSource xml_source = new InputSource(isr);
144
145 this.parser.parse(xml_source);
146 Document doc = this.parser.getDocument();
147
148 return doc;
149
150 } catch (Exception e) {
151 logger.error(e.getMessage());
152 }
153 return null;
154 }
155
156
157 /** creates a new empty DOM Document */
158 public Document newDOM() {
159 Document doc = new DocumentImpl();
160 return doc;
161 }
162
163 /** returns the Node as a String */
164 public String getString(Node xmlNode)
165 {
166 outputEscaping = true;
167 StringBuffer xmlRepresentation = new StringBuffer();
168 getString(xmlNode, xmlRepresentation, 0, false);
169 return xmlRepresentation.toString();
170 }
171
172 /** returns the node as a nicely formatted String - this introduces extra
173 * text nodes if the String is read back in as a DOM, so should only be
174 * used for printing */
175 public String getPrettyString(Node xmlNode) {
176
177 outputEscaping = true;
178 StringBuffer xmlRepresentation = new StringBuffer();
179 getString(xmlNode, xmlRepresentation, 0, true);
180 return xmlRepresentation.toString();
181 }
182
183 private void getString(Node xmlNode, StringBuffer xmlRepresentation,
184 int depth, boolean pretty)
185 {
186
187 if (xmlNode == null) {
188 xmlRepresentation.append("<null>");
189 return;
190 }
191
192 short nodeType = xmlNode.getNodeType();
193 String nodeName = xmlNode.getNodeName();
194
195 if (nodeType == Node.DOCUMENT_NODE) {
196 Document xmlDocNode = (Document)xmlNode;
197
198 if (xmlDocNode.getDoctype() == null) {
199 System.err.println("Doctype is null.");
200 }
201 else {
202
203 DocumentType dt = xmlDocNode.getDoctype();
204
205 String name = dt.getName();
206 String pid = dt.getPublicId();
207 String sid = dt.getSystemId();
208
209 String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\"" + sid + "\">\n";
210
211 xmlRepresentation.append(doctype_str);
212 }
213 getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
214 return;
215 }
216 // Handle Element nodes
217 if (nodeType == Node.ELEMENT_NODE) {
218 if (pretty) {
219 xmlRepresentation.append("\n");
220 for (int i = 0; i < depth; i++) {
221 xmlRepresentation.append(" ");
222 }
223 }
224
225 // Write opening tag
226 xmlRepresentation.append("<");
227 xmlRepresentation.append(nodeName);
228
229 // Write the node attributes
230 NamedNodeMap nodeAttributes = xmlNode.getAttributes();
231 for (int i = 0; i < nodeAttributes.getLength(); i++) {
232 Node attribute = nodeAttributes.item(i);
233 xmlRepresentation.append(" ");
234 xmlRepresentation.append(attribute.getNodeName());
235 xmlRepresentation.append("=\"");
236 xmlRepresentation.append(attribute.getNodeValue());
237 xmlRepresentation.append("\"");
238 }
239
240 // If the node has no children, close the opening tag and return
241 if (xmlNode.hasChildNodes() == false) {
242 // This produces somewhat ugly output, but it is necessary to compensate
243 // for display bugs in Netscape. Firstly, the space is needed before the
244 // closing bracket otherwise Netscape will ignore some tags (<br/>, for
245 // example). Also, a newline character would be expected after the tag,
246 // but this causes problems with the display of links (the link text
247 // will contain a newline character, which is displayed badly).
248 xmlRepresentation.append(" />");
249 return;
250 }
251
252 // Close the opening tag
253 xmlRepresentation.append(">");
254
255 // Apply recursively to the children of this node
256 // hack for nodes next to text nodes - dont make them pretty
257 // this is needed for text inside a <pre> element - any new lines
258 // or spaces around the span elements show up in the text
259 NodeList children = xmlNode.getChildNodes();
260 boolean do_pretty = pretty;
261 for (int i = 0; i < children.getLength(); i++) {
262 if (children.item(i).getNodeType()==Node.TEXT_NODE) {
263 do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
264 }
265 getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
266 }
267
268 // Write closing tag
269 if (pretty) {
270 if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
271 for (int i = 0; i < depth; i++)
272 xmlRepresentation.append(" ");
273 }
274 }
275 xmlRepresentation.append("</");
276 xmlRepresentation.append(nodeName);
277 xmlRepresentation.append(">");
278 if (pretty) {
279 xmlRepresentation.append("\n");
280 }
281 }
282
283 // Handle Text nodes
284 else if (nodeType == Node.TEXT_NODE) {
285 String text = xmlNode.getNodeValue();
286
287 // Perform output escaping, if required
288 if (outputEscaping) {
289 text = text.replaceAll("&", "&amp;"); // Must be done first!!
290 text = text.replaceAll("<", "&lt;");
291 text = text.replaceAll(">", "&gt;");
292 text = text.replaceAll("\"", "&quot;");
293 text = text.replaceAll("\'", "&apos;");
294 }
295
296 // Remove any control-C characters
297 text = text.replaceAll("" + (char) 3, "");
298 xmlRepresentation.append(text);
299 }
300
301 // Handle Processing Instruction nodes
302 else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
303 if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
304 outputEscaping = false;
305 }
306 else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
307 outputEscaping = true;
308 }
309 else {
310 logger.warn("Unhandled processing instruction " + nodeName);
311 }
312 }
313
314 else if (nodeType == Node.COMMENT_NODE) {
315 String text = xmlNode.getNodeValue();
316 xmlRepresentation.append("<!-- ");
317 xmlRepresentation.append(text);
318 xmlRepresentation.append(" -->");
319 }
320
321
322
323 // A type of node that is not handled yet
324 else {
325 logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
326 }
327
328 return;
329 }
330
331 protected static String getNodeTypeString(short node_type) {
332
333 String type = "";
334 switch(node_type) {
335 case Node.ATTRIBUTE_NODE:
336 type="ATTRIBUTE_NODE";
337 break;
338 case Node.CDATA_SECTION_NODE:
339 type="CDATA_SECTION_NODE";
340 break;
341 case Node.COMMENT_NODE:
342 type="COMMENT_NODE";
343 break;
344 case Node.DOCUMENT_FRAGMENT_NODE:
345 type="DOCUMENT_FRAGMENT_NODE";
346 break;
347 case Node.DOCUMENT_NODE:
348 type="DOCUMENT_NODE";
349 break;
350 case Node.DOCUMENT_TYPE_NODE:
351 type="DOCUMENT_TYPE_NODE";
352 break;
353 case Node.ELEMENT_NODE:
354 type="ELEMENT_NODE";
355 break;
356 case Node.ENTITY_NODE:
357 type="ENTITY_NODE";
358 break;
359 case Node.ENTITY_REFERENCE_NODE:
360 type="ENTITY_REFERENCE_NODE";
361 break;
362 case Node.NOTATION_NODE:
363 type="NOTATION_NODE";
364 break;
365 case Node.PROCESSING_INSTRUCTION_NODE:
366 type="PROCESSING_INSTRUCTION_NODE";
367 break;
368 case Node.TEXT_NODE:
369 type="TEXT_NODE";
370 break;
371 default:
372 type="UNKNOWN";
373 }
374
375 return type;
376 }
377}
Note: See TracBrowser for help on using the repository browser.