source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java@ 7306

Last change on this file since 7306 was 6344, checked in by cs025, 20 years ago

Improvements/changes to handling of HTML tidying

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.io.File;
4import java.io.FileReader;
5
6import java.net.URL;
7
8import org.w3c.dom.*;
9
10import org.xml.sax.XMLReader;
11import org.xml.sax.InputSource;
12import org.xml.sax.SAXException;
13import org.xml.sax.Attributes;
14import org.xml.sax.helpers.XMLReaderFactory;
15import org.xml.sax.helpers.DefaultHandler;
16import org.xml.sax.ext.LexicalHandler;
17
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
21import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
22
23public class HTMLTidy extends DefaultHandler
24{
25 String label;
26 File file;
27 StringBuffer value;
28 boolean inElement;
29 boolean accumulate;
30 Document document;
31 Element currentElement;
32 Element root;
33
34 class HTMLCommentHandler implements LexicalHandler
35 {
36 HTMLTidy parent;
37
38 HTMLCommentHandler(HTMLTidy tidier)
39 { this.parent = tidier;
40 }
41
42 public void comment(char ch[], int start, int length)
43 { String comment = new String(ch, start, length);
44
45 parent.comment(comment);
46 }
47
48 public void endCDATA()
49 {
50 }
51
52 public void endDTD()
53 {
54 }
55
56 public void endEntity(String name)
57 {
58 }
59
60 public void startCDATA()
61 {
62 }
63
64 public void startDTD(String name, String publicId, String systemId)
65 {
66 }
67
68 public void startEntity(String name)
69 {
70 }
71 }
72
73 public HTMLTidy(URL url)
74 { // TODO: complete
75 }
76
77 public HTMLTidy(File file)
78 { this.file = null;
79 this.label = null;
80 this.value = null;
81
82 this.document = new org.apache.html.dom.HTMLDocumentImpl();
83
84 this.currentElement = this.document.getDocumentElement();
85 this.root = this.currentElement;
86
87 try {
88 XMLReader handler = new org.ccil.cowan.tagsoup.Parser();
89 HTMLCommentHandler commentHandler = new HTMLCommentHandler(this);
90 handler.setContentHandler(this);
91 handler.setProperty("http://xml.org/sax/properties/lexical-handler", commentHandler);
92 handler.parse(file.toString());
93 }
94 catch (java.io.IOException io)
95 {
96 }
97 catch (org.xml.sax.SAXException saxEx)
98 {
99 System.out.println(saxEx);
100 }
101 }
102
103 public void startElement(String URI, String localName, String qName, Attributes attributes)
104 { //System.out.println("<"+localName+">");
105
106 if (localName.equals("html") == false) {
107 Element childElement = this.document.createElement(localName);
108
109 this.currentElement.appendChild(childElement);
110 this.currentElement = childElement;
111
112 for (int a = 0; a < attributes.getLength(); a ++)
113 { String name = attributes.getLocalName(a);
114 String value = attributes.getValue(a);
115
116 childElement.setAttribute(name, value);
117 }
118 }
119 }
120
121 public void comment(String text)
122 {
123 Comment comment = document.createComment(text);
124 this.currentElement.appendChild(comment);
125 //System.out.println("<!-- "+text+" -->");
126 }
127
128 public void endElement(String URI, String localName, String qName)
129 { // System.out.println("Ending " + localName);
130 // System.out.println("</"+localName+">");
131
132 if (localName.equals("html")) {
133 return;
134 }
135
136 // TODO: check is the same name as current node
137 if (!this.currentElement.getTagName().equals(localName.toUpperCase())) {
138 // TODO: error handling
139 System.out.println("Bad end at " + localName + " " + this.currentElement.getTagName());
140 return;
141 }
142
143 // Optimisation: is this an empty tag?
144 boolean isEmpty = false;
145
146 if (this.currentElement.getChildNodes().getLength() == 1)
147 { Node child = this.currentElement.getFirstChild();
148
149 if (child.getNodeType() == Node.TEXT_NODE) {
150 String text = child.getNodeValue();
151
152 int i = 0;
153 while (i < text.length())
154 { if (text.charAt(i) > 32 && text.charAt(i) != 160) {
155 break;
156 }
157 i++;
158 }
159 if (text.length() == i) {
160 isEmpty = true;
161 }
162 }
163 }
164 else if (this.currentElement.getChildNodes().getLength() == 0)
165 { isEmpty = true;
166 }
167
168 boolean isSingleton = false;
169 String tagName = this.currentElement.getTagName().toLowerCase();
170
171 if (tagName.equals("img") ||
172 tagName.equals("br") ||
173 tagName.equals("meta") ||
174 tagName.equals("input") ||
175 tagName.equals("area") ||
176 tagName.equals("link") ||
177 tagName.equals("base") ||
178 tagName.equals("img") ||
179 tagName.equals("hr")) {
180 isSingleton = true;
181 }
182
183 Element parent = (Element) this.currentElement.getParentNode();
184
185 if (isEmpty == true && !isSingleton) {
186 // trim the current child
187 parent.removeChild(this.currentElement);
188 // System.out.println("***Removing child***");
189 }
190
191 this.currentElement = parent;
192 }
193
194 public void characters(char c[], int start, int length)
195 { String string = new String(c, start, length);
196 Node text_node = this.document.createTextNode(string);
197 this.currentElement.appendChild(text_node);
198 // System.out.println(string);
199 }
200
201 public void recursePrint(Element element, int indent)
202 {
203 for (int i = 0; i < indent; i++) {
204 System.out.print(" ");
205 }
206 System.out.println(element.toString());
207 for (int c = 0; c < element.getChildNodes().getLength(); c ++)
208 { Node n = element.getChildNodes().item(c);
209
210 if (n.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
211 Element e = (Element) n;
212
213 this.recursePrint(e, indent+2);
214 }
215 }
216 }
217
218 public Document getDocument()
219 {
220 // this.recursePrint(this.root, 0);
221 // System.out.println(this.document.toString());
222 // System.out.println(this.root);
223 return this.document;
224 }
225}
Note: See TracBrowser for help on using the repository browser.