source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java@ 8710

Last change on this file since 8710 was 8710, checked in by chi, 19 years ago

Change program layout

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.io.File;
4import java.io.FileReader;
5
6import java.net.URL;
7
8import org.w3c.dom.*;
9
10import org.xml.sax.XMLReader;
11import org.xml.sax.InputSource;
12import org.xml.sax.SAXException;
13import org.xml.sax.Attributes;
14import org.xml.sax.helpers.XMLReaderFactory;
15import org.xml.sax.helpers.DefaultHandler;
16import org.xml.sax.ext.LexicalHandler;
17
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
21import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
22
23public class HTMLTidy extends DefaultHandler
24{
25 String label;
26 File file;
27 StringBuffer value;
28 boolean inElement;
29 boolean accumulate;
30 Document document;
31 Element currentElement;
32 Element root;
33
34 class HTMLCommentHandler implements LexicalHandler
35 {
36 HTMLTidy parent;
37
38 HTMLCommentHandler(HTMLTidy tidier)
39 {
40 this.parent = tidier;
41 }
42
43 public void comment(char ch[], int start, int length)
44 {
45 String comment = new String(ch, start, length);
46 parent.comment(comment);
47 }
48
49 public void endCDATA()
50 {
51 }
52
53 public void endDTD()
54 {
55 }
56
57 public void endEntity(String name)
58 {
59 }
60
61 public void startCDATA()
62 {
63 }
64
65 public void startDTD(String name, String publicId, String systemId)
66 {
67 }
68
69 public void startEntity(String name)
70 {
71 }
72 }
73
74 public HTMLTidy(URL url)
75 {
76 // TODO: complete
77 }
78
79 public HTMLTidy(File file)
80 {
81 this.file = null;
82 this.label = null;
83 this.value = null;
84
85 this.document = new org.apache.html.dom.HTMLDocumentImpl();
86
87 this.currentElement = this.document.getDocumentElement();
88 this.root = this.currentElement;
89
90 try {
91 XMLReader handler = new org.ccil.cowan.tagsoup.Parser();
92 HTMLCommentHandler commentHandler = new HTMLCommentHandler(this);
93 handler.setContentHandler(this);
94 handler.setProperty("http://xml.org/sax/properties/lexical-handler", commentHandler);
95 handler.parse(file.toString());
96 }
97 catch (java.io.IOException io)
98 {
99 }
100 catch (org.xml.sax.SAXException saxEx)
101 {
102 System.out.println(saxEx);
103 }
104 }
105
106 public void startElement(String URI, String localName, String qName, Attributes attributes)
107 {
108 //System.out.println("<"+localName+">");
109
110 if (localName.equals("html") == false) {
111 Element childElement = this.document.createElement(localName);
112
113 this.currentElement.appendChild(childElement);
114 this.currentElement = childElement;
115
116 for (int a = 0; a < attributes.getLength(); a ++){
117 String name = attributes.getLocalName(a);
118 String value = attributes.getValue(a);
119
120 childElement.setAttribute(name, value);
121 }
122 }
123 }
124
125 public void comment(String text)
126 {
127 Comment comment = document.createComment(text);
128 this.currentElement.appendChild(comment);
129 //System.out.println("<!-- "+text+" -->");
130 }
131
132 public void endElement(String URI, String localName, String qName)
133 {
134 // System.out.println("Ending " + localName);
135 // System.out.println("</"+localName+">");
136
137 if (localName.equals("html")) {
138 return;
139 }
140
141 // TODO: check is the same name as current node
142 if (!this.currentElement.getTagName().equals(localName.toUpperCase())) {
143 // TODO: error handling
144 System.out.println("Bad end at " + localName + " " + this.currentElement.getTagName());
145 return;
146 }
147
148 // Optimisation: is this an empty tag?
149 boolean isEmpty = false;
150
151 if (this.currentElement.getChildNodes().getLength() == 1){
152 Node child = this.currentElement.getFirstChild();
153
154 if (child.getNodeType() == Node.TEXT_NODE) {
155 String text = child.getNodeValue();
156
157 int i = 0;
158 while (i < text.length()){
159 if (text.charAt(i) > 32 && text.charAt(i) != 160) {
160 break;
161 }
162 i++;
163 }
164 if (text.length() == i) {
165 isEmpty = true;
166 }
167 }
168 }
169 else if (this.currentElement.getChildNodes().getLength() == 0){
170 isEmpty = true;
171 }
172
173 boolean isSingleton = false;
174 String tagName = this.currentElement.getTagName().toLowerCase();
175
176 if (tagName.equals("img") ||
177 tagName.equals("br") ||
178 tagName.equals("meta") ||
179 tagName.equals("input") ||
180 tagName.equals("area") ||
181 tagName.equals("link") ||
182 tagName.equals("base") ||
183 //tagName.equals("img") ||
184 tagName.equals("hr")) {
185 isSingleton = true;
186 }
187
188 Element parent = (Element) this.currentElement.getParentNode();
189
190 if (isEmpty == true && !isSingleton) {
191 // trim the current child
192 parent.removeChild(this.currentElement);
193 // System.out.println("***Removing child***");
194 }
195
196 this.currentElement = parent;
197 }
198
199 public void characters(char c[], int start, int length)
200 {
201 String string = new String(c, start, length);
202 Node text_node = this.document.createTextNode(string);
203 this.currentElement.appendChild(text_node);
204 // System.out.println(string);
205 }
206
207 public void recursePrint(Element element, int indent)
208 {
209 for (int i = 0; i < indent; i++) {
210 System.out.print(" ");
211 }
212 System.out.println(element.toString());
213 for (int c = 0; c < element.getChildNodes().getLength(); c ++){
214 Node n = element.getChildNodes().item(c);
215
216 if (n.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
217 Element e = (Element) n;
218
219 this.recursePrint(e, indent+2);
220 }
221 }
222 }
223
224 public Document getDocument()
225 {
226 // this.recursePrint(this.root, 0);
227 // System.out.println(this.document.toString());
228 // System.out.println(this.root);
229 return this.document;
230 }
231}
Note: See TracBrowser for help on using the repository browser.