1 | package org.greenstone.gsdl3.gs3build.util;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.io.FileReader;
|
---|
5 |
|
---|
6 | import java.net.URL;
|
---|
7 |
|
---|
8 | import org.w3c.dom.*;
|
---|
9 |
|
---|
10 | import org.xml.sax.XMLReader;
|
---|
11 | import org.xml.sax.InputSource;
|
---|
12 | import org.xml.sax.SAXException;
|
---|
13 | import org.xml.sax.Attributes;
|
---|
14 | import org.xml.sax.helpers.XMLReaderFactory;
|
---|
15 | import org.xml.sax.helpers.DefaultHandler;
|
---|
16 | import org.xml.sax.ext.LexicalHandler;
|
---|
17 |
|
---|
18 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
|
---|
19 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
|
---|
20 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
|
---|
21 | import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
|
---|
22 |
|
---|
23 | public class HTMLTidy extends DefaultHandler
|
---|
24 | {
|
---|
25 | String label;
|
---|
26 | File file;
|
---|
27 | StringBuffer value;
|
---|
28 | boolean inElement;
|
---|
29 | boolean accumulate;
|
---|
30 | Document document;
|
---|
31 | Element currentElement;
|
---|
32 | Element root;
|
---|
33 |
|
---|
34 | class HTMLCommentHandler implements LexicalHandler
|
---|
35 | {
|
---|
36 | HTMLTidy parent;
|
---|
37 |
|
---|
38 | HTMLCommentHandler(HTMLTidy tidier)
|
---|
39 | {
|
---|
40 | this.parent = tidier;
|
---|
41 | }
|
---|
42 |
|
---|
43 | public void comment(char ch[], int start, int length)
|
---|
44 | {
|
---|
45 | String comment = new String(ch, start, length);
|
---|
46 | parent.comment(comment);
|
---|
47 | }
|
---|
48 |
|
---|
49 | public void endCDATA()
|
---|
50 | {
|
---|
51 | }
|
---|
52 |
|
---|
53 | public void endDTD()
|
---|
54 | {
|
---|
55 | }
|
---|
56 |
|
---|
57 | public void endEntity(String name)
|
---|
58 | {
|
---|
59 | }
|
---|
60 |
|
---|
61 | public void startCDATA()
|
---|
62 | {
|
---|
63 | }
|
---|
64 |
|
---|
65 | public void startDTD(String name, String publicId, String systemId)
|
---|
66 | {
|
---|
67 | }
|
---|
68 |
|
---|
69 | public void startEntity(String name)
|
---|
70 | {
|
---|
71 | }
|
---|
72 | }
|
---|
73 |
|
---|
74 | public HTMLTidy(URL url)
|
---|
75 | {
|
---|
76 | // TODO: complete
|
---|
77 | }
|
---|
78 |
|
---|
79 | public HTMLTidy(File file)
|
---|
80 | {
|
---|
81 | this.file = null;
|
---|
82 | this.label = null;
|
---|
83 | this.value = null;
|
---|
84 |
|
---|
85 | this.document = new org.apache.html.dom.HTMLDocumentImpl();
|
---|
86 |
|
---|
87 | this.currentElement = this.document.getDocumentElement();
|
---|
88 | this.root = this.currentElement;
|
---|
89 |
|
---|
90 | try {
|
---|
91 | XMLReader handler = new org.ccil.cowan.tagsoup.Parser();
|
---|
92 | HTMLCommentHandler commentHandler = new HTMLCommentHandler(this);
|
---|
93 | handler.setContentHandler(this);
|
---|
94 | handler.setProperty("http://xml.org/sax/properties/lexical-handler", commentHandler);
|
---|
95 | handler.parse(file.toURI().toString());
|
---|
96 | }
|
---|
97 | catch (java.io.IOException io)
|
---|
98 | {
|
---|
99 | System.out.println(io);
|
---|
100 | }
|
---|
101 | catch (org.xml.sax.SAXException saxEx)
|
---|
102 | {
|
---|
103 | System.out.println(saxEx);
|
---|
104 | }
|
---|
105 | }
|
---|
106 |
|
---|
107 | public void startElement(String URI, String localName, String qName, Attributes attributes)
|
---|
108 | {
|
---|
109 | //System.out.println("<"+localName+">");
|
---|
110 |
|
---|
111 | if (localName.equals("html") == false) {
|
---|
112 | Element childElement = this.document.createElement(localName);
|
---|
113 |
|
---|
114 | this.currentElement.appendChild(childElement);
|
---|
115 | this.currentElement = childElement;
|
---|
116 |
|
---|
117 | for (int a = 0; a < attributes.getLength(); a ++){
|
---|
118 | String name = attributes.getLocalName(a);
|
---|
119 | String value = attributes.getValue(a);
|
---|
120 |
|
---|
121 | childElement.setAttribute(name, value);
|
---|
122 | }
|
---|
123 | }
|
---|
124 | }
|
---|
125 |
|
---|
126 | public void comment(String text)
|
---|
127 | {
|
---|
128 | Comment comment = document.createComment(text);
|
---|
129 | this.currentElement.appendChild(comment);
|
---|
130 | //System.out.println("<!-- "+text+" -->");
|
---|
131 | }
|
---|
132 |
|
---|
133 | public void endElement(String URI, String localName, String qName)
|
---|
134 | {
|
---|
135 | // System.out.println("Ending " + localName);
|
---|
136 | // System.out.println("</"+localName+">");
|
---|
137 |
|
---|
138 | if (localName.equals("html")) {
|
---|
139 | return;
|
---|
140 | }
|
---|
141 |
|
---|
142 | // TODO: check is the same name as current node
|
---|
143 | if (!this.currentElement.getTagName().equals(localName.toUpperCase())) {
|
---|
144 | // TODO: error handling
|
---|
145 | System.out.println("Bad end at " + localName + " " + this.currentElement.getTagName());
|
---|
146 | return;
|
---|
147 | }
|
---|
148 |
|
---|
149 | // Optimisation: is this an empty tag?
|
---|
150 | boolean isEmpty = false;
|
---|
151 |
|
---|
152 | if (this.currentElement.getChildNodes().getLength() == 1){
|
---|
153 | Node child = this.currentElement.getFirstChild();
|
---|
154 |
|
---|
155 | if (child.getNodeType() == Node.TEXT_NODE) {
|
---|
156 | String text = child.getNodeValue();
|
---|
157 |
|
---|
158 | int i = 0;
|
---|
159 | while (i < text.length()){
|
---|
160 | if (text.charAt(i) > 32 && text.charAt(i) != 160) {
|
---|
161 | break;
|
---|
162 | }
|
---|
163 | i++;
|
---|
164 | }
|
---|
165 | if (text.length() == i) {
|
---|
166 | isEmpty = true;
|
---|
167 | }
|
---|
168 | }
|
---|
169 | }
|
---|
170 | else if (this.currentElement.getChildNodes().getLength() == 0){
|
---|
171 | isEmpty = true;
|
---|
172 | }
|
---|
173 |
|
---|
174 | boolean isSingleton = false;
|
---|
175 | String tagName = this.currentElement.getTagName().toLowerCase();
|
---|
176 |
|
---|
177 | if (tagName.equals("img") ||
|
---|
178 | tagName.equals("br") ||
|
---|
179 | tagName.equals("meta") ||
|
---|
180 | tagName.equals("input") ||
|
---|
181 | tagName.equals("area") ||
|
---|
182 | tagName.equals("link") ||
|
---|
183 | tagName.equals("base") ||
|
---|
184 | //tagName.equals("img") ||
|
---|
185 | tagName.equals("hr")) {
|
---|
186 | isSingleton = true;
|
---|
187 | }
|
---|
188 |
|
---|
189 | Element parent = (Element) this.currentElement.getParentNode();
|
---|
190 |
|
---|
191 | if (isEmpty == true && !isSingleton) {
|
---|
192 | // trim the current child
|
---|
193 | parent.removeChild(this.currentElement);
|
---|
194 | // System.out.println("***Removing child***");
|
---|
195 | }
|
---|
196 |
|
---|
197 | this.currentElement = parent;
|
---|
198 | }
|
---|
199 |
|
---|
200 | public void characters(char c[], int start, int length)
|
---|
201 | {
|
---|
202 | String string = new String(c, start, length);
|
---|
203 | Node text_node = this.document.createTextNode(string);
|
---|
204 | this.currentElement.appendChild(text_node);
|
---|
205 | // System.out.println(string);
|
---|
206 | }
|
---|
207 |
|
---|
208 | public void recursePrint(Element element, int indent)
|
---|
209 | {
|
---|
210 | for (int i = 0; i < indent; i++) {
|
---|
211 | System.out.print(" ");
|
---|
212 | }
|
---|
213 | System.out.println(element.toString());
|
---|
214 | for (int c = 0; c < element.getChildNodes().getLength(); c ++){
|
---|
215 | Node n = element.getChildNodes().item(c);
|
---|
216 |
|
---|
217 | if (n.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
|
---|
218 | Element e = (Element) n;
|
---|
219 |
|
---|
220 | this.recursePrint(e, indent+2);
|
---|
221 | }
|
---|
222 | }
|
---|
223 | }
|
---|
224 |
|
---|
225 | public Document getDocument()
|
---|
226 | {
|
---|
227 | // this.recursePrint(this.root, 0);
|
---|
228 | // System.out.println(this.document.toString());
|
---|
229 | // System.out.println(this.root);
|
---|
230 | return this.document;
|
---|
231 | }
|
---|
232 | }
|
---|