source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java@ 12188

Last change on this file since 12188 was 12188, checked in by kjdon, 18 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.io.File;
4import java.io.FileReader;
5
6import java.net.URL;
7
8import org.w3c.dom.*;
9
10import org.xml.sax.XMLReader;
11import org.xml.sax.InputSource;
12import org.xml.sax.SAXException;
13import org.xml.sax.Attributes;
14import org.xml.sax.helpers.XMLReaderFactory;
15import org.xml.sax.helpers.DefaultHandler;
16import org.xml.sax.ext.LexicalHandler;
17
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
21import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
22
23public class HTMLTidy extends DefaultHandler
24{
25 String label;
26 File file;
27 StringBuffer value;
28 boolean inElement;
29 boolean accumulate;
30 Document document;
31 Element currentElement;
32 Element root;
33
34 class HTMLCommentHandler implements LexicalHandler
35 {
36 HTMLTidy parent;
37
38 HTMLCommentHandler(HTMLTidy tidier)
39 {
40 this.parent = tidier;
41 }
42
43 public void comment(char ch[], int start, int length)
44 {
45 String comment = new String(ch, start, length);
46 parent.comment(comment);
47 }
48
49 public void endCDATA()
50 {
51 }
52
53 public void endDTD()
54 {
55 }
56
57 public void endEntity(String name)
58 {
59 }
60
61 public void startCDATA()
62 {
63 }
64
65 public void startDTD(String name, String publicId, String systemId)
66 {
67 }
68
69 public void startEntity(String name)
70 {
71 }
72 }
73
74 public HTMLTidy(URL url)
75 {
76 // TODO: complete
77 }
78
79 public HTMLTidy(File file)
80 {
81 this.file = null;
82 this.label = null;
83 this.value = null;
84
85 this.document = new org.apache.html.dom.HTMLDocumentImpl();
86
87 this.currentElement = this.document.getDocumentElement();
88 this.root = this.currentElement;
89
90 try {
91 XMLReader handler = new org.ccil.cowan.tagsoup.Parser();
92 HTMLCommentHandler commentHandler = new HTMLCommentHandler(this);
93 handler.setContentHandler(this);
94 handler.setProperty("http://xml.org/sax/properties/lexical-handler", commentHandler);
95 handler.parse(file.toURI().toString());
96 }
97 catch (java.io.IOException io)
98 {
99 System.out.println(io);
100 }
101 catch (org.xml.sax.SAXException saxEx)
102 {
103 System.out.println(saxEx);
104 }
105 }
106
107 public void startElement(String URI, String localName, String qName, Attributes attributes)
108 {
109 //System.out.println("<"+localName+">");
110
111 if (localName.equals("html") == false) {
112 Element childElement = this.document.createElement(localName);
113
114 this.currentElement.appendChild(childElement);
115 this.currentElement = childElement;
116
117 for (int a = 0; a < attributes.getLength(); a ++){
118 String name = attributes.getLocalName(a);
119 String value = attributes.getValue(a);
120
121 childElement.setAttribute(name, value);
122 }
123 }
124 }
125
126 public void comment(String text)
127 {
128 Comment comment = document.createComment(text);
129 this.currentElement.appendChild(comment);
130 //System.out.println("<!-- "+text+" -->");
131 }
132
133 public void endElement(String URI, String localName, String qName)
134 {
135 // System.out.println("Ending " + localName);
136 // System.out.println("</"+localName+">");
137
138 if (localName.equals("html")) {
139 return;
140 }
141
142 // TODO: check is the same name as current node
143 if (!this.currentElement.getTagName().equals(localName.toUpperCase())) {
144 // TODO: error handling
145 System.out.println("Bad end at " + localName + " " + this.currentElement.getTagName());
146 return;
147 }
148
149 // Optimisation: is this an empty tag?
150 boolean isEmpty = false;
151
152 if (this.currentElement.getChildNodes().getLength() == 1){
153 Node child = this.currentElement.getFirstChild();
154
155 if (child.getNodeType() == Node.TEXT_NODE) {
156 String text = child.getNodeValue();
157
158 int i = 0;
159 while (i < text.length()){
160 if (text.charAt(i) > 32 && text.charAt(i) != 160) {
161 break;
162 }
163 i++;
164 }
165 if (text.length() == i) {
166 isEmpty = true;
167 }
168 }
169 }
170 else if (this.currentElement.getChildNodes().getLength() == 0){
171 isEmpty = true;
172 }
173
174 boolean isSingleton = false;
175 String tagName = this.currentElement.getTagName().toLowerCase();
176
177 if (tagName.equals("img") ||
178 tagName.equals("br") ||
179 tagName.equals("meta") ||
180 tagName.equals("input") ||
181 tagName.equals("area") ||
182 tagName.equals("link") ||
183 tagName.equals("base") ||
184 //tagName.equals("img") ||
185 tagName.equals("hr")) {
186 isSingleton = true;
187 }
188
189 Element parent = (Element) this.currentElement.getParentNode();
190
191 if (isEmpty == true && !isSingleton) {
192 // trim the current child
193 parent.removeChild(this.currentElement);
194 // System.out.println("***Removing child***");
195 }
196
197 this.currentElement = parent;
198 }
199
200 public void characters(char c[], int start, int length)
201 {
202 String string = new String(c, start, length);
203 Node text_node = this.document.createTextNode(string);
204 this.currentElement.appendChild(text_node);
205 // System.out.println(string);
206 }
207
208 public void recursePrint(Element element, int indent)
209 {
210 for (int i = 0; i < indent; i++) {
211 System.out.print(" ");
212 }
213 System.out.println(element.toString());
214 for (int c = 0; c < element.getChildNodes().getLength(); c ++){
215 Node n = element.getChildNodes().item(c);
216
217 if (n.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
218 Element e = (Element) n;
219
220 this.recursePrint(e, indent+2);
221 }
222 }
223 }
224
225 public Document getDocument()
226 {
227 // this.recursePrint(this.root, 0);
228 // System.out.println(this.document.toString());
229 // System.out.println(this.root);
230 return this.document;
231 }
232}
Note: See TracBrowser for help on using the repository browser.