source: branches/ant-install-branch/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java@ 9845

Last change on this file since 9845 was 9845, checked in by kjdon, 19 years ago

In the handler.parse call, replaced file.toString() with file.toURI().toString() because the former did not work on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.io.File;
4import java.io.FileReader;
5
6import java.net.URL;
7
8import org.w3c.dom.*;
9
10import org.xml.sax.XMLReader;
11import org.xml.sax.InputSource;
12import org.xml.sax.SAXException;
13import org.xml.sax.Attributes;
14import org.xml.sax.helpers.XMLReaderFactory;
15import org.xml.sax.helpers.DefaultHandler;
16import org.xml.sax.ext.LexicalHandler;
17
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
21import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
22
23public class HTMLTidy extends DefaultHandler
24{
25 String label;
26 File file;
27 StringBuffer value;
28 boolean inElement;
29 boolean accumulate;
30 Document document;
31 Element currentElement;
32 Element root;
33
34 class HTMLCommentHandler implements LexicalHandler
35 {
36 HTMLTidy parent;
37
38 HTMLCommentHandler(HTMLTidy tidier)
39 {
40 this.parent = tidier;
41 }
42
43 public void comment(char ch[], int start, int length)
44 {
45 String comment = new String(ch, start, length);
46 parent.comment(comment);
47 }
48
49 public void endCDATA()
50 {
51 }
52
53 public void endDTD()
54 {
55 }
56
57 public void endEntity(String name)
58 {
59 }
60
61 public void startCDATA()
62 {
63 }
64
65 public void startDTD(String name, String publicId, String systemId)
66 {
67 }
68
69 public void startEntity(String name)
70 {
71 }
72 }
73
74 public HTMLTidy(URL url)
75 {
76 // TODO: complete
77 }
78
79 public HTMLTidy(File file)
80 {
81 this.file = null;
82 this.label = null;
83 this.value = null;
84
85 this.document = new org.apache.html.dom.HTMLDocumentImpl();
86
87 this.currentElement = this.document.getDocumentElement();
88 this.root = this.currentElement;
89
90 try {
91 XMLReader handler = new org.ccil.cowan.tagsoup.Parser();
92 HTMLCommentHandler commentHandler = new HTMLCommentHandler(this);
93 handler.setContentHandler(this);
94 handler.setProperty("http://xml.org/sax/properties/lexical-handler", commentHandler);
95 handler.parse(file.toURI().toString());
96 }
97 catch (java.io.IOException io)
98 {
99 System.out.println(io);
100 }
101 catch (org.xml.sax.SAXException saxEx)
102 {
103 System.out.println(saxEx);
104 }
105 }
106
107 public void startElement(String URI, String localName, String qName, Attributes attributes)
108 {
109 //System.out.println("<"+localName+">");
110
111 if (localName.equals("html") == false) {
112 Element childElement = this.document.createElement(localName);
113
114 this.currentElement.appendChild(childElement);
115 this.currentElement = childElement;
116
117 for (int a = 0; a < attributes.getLength(); a ++){
118 String name = attributes.getLocalName(a);
119 String value = attributes.getValue(a);
120
121 childElement.setAttribute(name, value);
122 }
123 }
124 }
125
126 public void comment(String text)
127 {
128 Comment comment = document.createComment(text);
129 this.currentElement.appendChild(comment);
130 //System.out.println("<!-- "+text+" -->");
131 }
132
133 public void endElement(String URI, String localName, String qName)
134 {
135 // System.out.println("Ending " + localName);
136 // System.out.println("</"+localName+">");
137
138 if (localName.equals("html")) {
139 return;
140 }
141
142 // TODO: check is the same name as current node
143 if (!this.currentElement.getTagName().equals(localName.toUpperCase())) {
144 // TODO: error handling
145 System.out.println("Bad end at " + localName + " " + this.currentElement.getTagName());
146 return;
147 }
148
149 // Optimisation: is this an empty tag?
150 boolean isEmpty = false;
151
152 if (this.currentElement.getChildNodes().getLength() == 1){
153 Node child = this.currentElement.getFirstChild();
154
155 if (child.getNodeType() == Node.TEXT_NODE) {
156 String text = child.getNodeValue();
157
158 int i = 0;
159 while (i < text.length()){
160 if (text.charAt(i) > 32 && text.charAt(i) != 160) {
161 break;
162 }
163 i++;
164 }
165 if (text.length() == i) {
166 isEmpty = true;
167 }
168 }
169 }
170 else if (this.currentElement.getChildNodes().getLength() == 0){
171 isEmpty = true;
172 }
173
174 boolean isSingleton = false;
175 String tagName = this.currentElement.getTagName().toLowerCase();
176
177 if (tagName.equals("img") ||
178 tagName.equals("br") ||
179 tagName.equals("meta") ||
180 tagName.equals("input") ||
181 tagName.equals("area") ||
182 tagName.equals("link") ||
183 tagName.equals("base") ||
184 //tagName.equals("img") ||
185 tagName.equals("hr")) {
186 isSingleton = true;
187 }
188
189 Element parent = (Element) this.currentElement.getParentNode();
190
191 if (isEmpty == true && !isSingleton) {
192 // trim the current child
193 parent.removeChild(this.currentElement);
194 // System.out.println("***Removing child***");
195 }
196
197 this.currentElement = parent;
198 }
199
200 public void characters(char c[], int start, int length)
201 {
202 String string = new String(c, start, length);
203 Node text_node = this.document.createTextNode(string);
204 this.currentElement.appendChild(text_node);
205 // System.out.println(string);
206 }
207
208 public void recursePrint(Element element, int indent)
209 {
210 for (int i = 0; i < indent; i++) {
211 System.out.print(" ");
212 }
213 System.out.println(element.toString());
214 for (int c = 0; c < element.getChildNodes().getLength(); c ++){
215 Node n = element.getChildNodes().item(c);
216
217 if (n.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
218 Element e = (Element) n;
219
220 this.recursePrint(e, indent+2);
221 }
222 }
223 }
224
225 public Document getDocument()
226 {
227 // this.recursePrint(this.root, 0);
228 // System.out.println(this.document.toString());
229 // System.out.println(this.root);
230 return this.document;
231 }
232}
Note: See TracBrowser for help on using the repository browser.