source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 6284

Last change on this file since 6284 was 6284, checked in by cs025, 20 years ago

Added HTMLDocumentTools, also modifications to the abstract interfaces
and the HTMLDocument doctype to support indexing by section.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1package org.greenstone.gsdl3.gs3build.doctypes;
2
3import java.io.File;
4import java.net.URL;
5import java.net.MalformedURLException;
6
7import java.util.List;
8import java.util.Map;
9import java.util.ArrayList;
10
11import org.greenstone.gsdl3.gs3build.metadata.*;
12import org.greenstone.gsdl3.gs3build.util.*;
13import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
14
15import org.w3c.dom.*;
16
17public class HTMLDocument extends AbstractDocument
18{
19 public static final String HTML_DOCUMENT_TYPE = "HTML";
20 Document domDocument;
21
22 public HTMLDocument(DocumentID id)
23 { super(id);
24 this.domDocument = null;
25 }
26
27 /**
28 * Create the HTMLDocument from a given URL - the URL may in fact be a reference
29 * to a local file.
30 *
31 * @param <code>URL</code> The location from which to load the file
32 */
33 public HTMLDocument(URL url)
34 { super(url);
35
36 this.loadDocument(url);
37
38 this._extractDocumentFiles();
39 this._extractDocumentMetadata();
40
41 HTMLDocumentTools docTools = new HTMLDocumentTools(this.domDocument);
42 docTools.setMetsDocument(this);
43 docTools.setUrl(this.fileSet.getFile(0).getLocation());
44 docTools.findSections();
45 METSStructure sectionStruct = docTools.getStructure();
46
47 if (sectionStruct.size() > 0) {
48 METSStructureSet structureSet = this.getDocumentStructure();
49 structureSet.addStructure(sectionStruct);
50 }
51 }
52
53 private void loadDocument(URL url)
54 {
55 // HTMLDoc htmlDoc;
56 HTMLTidy tidyDoc;
57 if (url.toString().startsWith("file://"))
58 { //htmlDoc = new HTMLDoc(url, url.toString().substring(7));
59 tidyDoc = new HTMLTidy(new File(url.toString().substring(7)));
60 }
61 else if (url.toString().startsWith("file:/"))
62 { //htmlDoc = new HTMLDoc(url, url.toString().substring(5));
63 tidyDoc = new HTMLTidy(new File( url.toString().substring(5)));
64 }
65 else
66 { //htmlDoc = new HTMLDoc(url);
67 tidyDoc = new HTMLTidy(url);
68 }
69
70 this.domDocument = tidyDoc.getDocument();
71 }
72
73 private void _extractDocumentMetadata()
74 {
75 NodeList metadata = this.domDocument.getElementsByTagName("META");
76 for (int n = 0; n < metadata.getLength(); n ++) {
77 Node node = metadata.item(n);
78 Element element = (Element) node;
79
80 String name = element.getAttribute("name");
81 if (name == null || name.length() == 0) {
82 continue;
83 }
84
85 String value = element.getAttribute("content");
86 if (value == null || value.length() == 0) {
87 value = name;
88 }
89
90 this.addDocumentMetadata(new MetadataLabel(name), value);
91 }
92
93 NodeList titles = this.domDocument.getElementsByTagName("TITLE");
94 StringBuffer title = new StringBuffer();
95 for (int n = 0; n < titles.getLength(); n ++) {
96 Node node = titles.item(n);
97 Element element = (Element) node;
98
99 NodeList childNodes = node.getChildNodes();
100 for (int c = 0; c < childNodes.getLength(); c ++) {
101 Node child = childNodes.item(c);
102 if (child.getNodeType() == org.w3c.dom.Node.TEXT_NODE) {
103 title.append(child.getNodeValue());
104 }
105 }
106 }
107
108 if (title.length() > 0)
109 { this.addDocumentMetadata(new MetadataLabel("title"), title.toString());
110 }
111
112 /* Old HTMLDoc based parsing...
113 HTMLBlock codedContent = htmlDoc.getCodedContent();
114 boolean inTitle = false;
115 title = new StringBuffer();
116
117 for (int e = 0; e < codedContent.size(); e ++)
118 { if (codedContent.elementAt(e) instanceof HTMLTag)
119 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
120
121 if (tag.tagName().equals("meta"))
122 { // check that the name of the metadata item exists
123 String name = tag.idValue("name");
124 if (name == null || name.length() == 0) {
125 continue;
126 }
127
128 // get the value, if it exists
129 String value = tag.idValue("content");
130
131 // if value does not exist, default it to being the same
132 // as the name.
133 if (value == null || value.length() > 0) {
134 value = name;
135 }
136 }
137 else
138 if (tag.tagName().equals("title"))
139 { inTitle = true;
140 }
141 else if (tag.tagName().equals("/title"))
142 { inTitle = false;
143 }
144 // cut off when real body content appears - not a perfect
145 // implementation, just cheap & cheerful
146 else if (tag.tagName().equals("/head"))
147 { break;
148 }
149 else if (tag.tagName().equals("body"))
150 { break;
151 }
152 }
153 else if (inTitle == true)
154 { title.append(codedContent.elementAt(e).toString());
155 }
156 }
157 */
158 }
159
160 private void _extractDocumentFiles()
161 { URL homeUrl = this.fileSet.getFile(0).getLocation();
162
163 NodeList metadata = this.domDocument.getElementsByTagName("img");
164 for (int n = 0; n < metadata.getLength(); n ++) {
165 Node node = metadata.item(n);
166 Element element = (Element) node;
167
168 String location = element.getAttribute("src");
169 if (location == null || location.length() == 0) {
170 System.out.println("No name");
171 continue;
172 }
173
174 try
175 { // make the url for the image, and then add it to the document list of
176 //
177 URL imgUrl = new URL(homeUrl, location);
178 METSFile file = this.fileSet.addFile(imgUrl);
179 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
180 }
181 catch (MalformedURLException ex)
182 { // TODO: report exception/failure to resolve...
183 }
184 }
185
186 /**
187 HTMLBlock codedContent = htmlDoc.getCodedContent();
188 for (int e = 0; e < codedContent.size(); e ++)
189 { if (codedContent.elementAt(e) instanceof HTMLTag)
190 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
191
192 if (tag.tagName().equals("img"))
193 { String location = tag.idValue("src");
194
195 try
196 { // make the url for the image, and then add it to the document list of
197 //
198 URL imgUrl = new URL(homeUrl, location);
199 METSFile file = this.fileSet.addFile(imgUrl);
200 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
201 }
202 catch (MalformedURLException ex)
203 { // TODO: report exception/failure to resolve...
204 }
205 }
206 }
207 }
208 */
209 }
210
211 public String getDocumentType()
212 { return HTML_DOCUMENT_TYPE;
213 }
214
215 public String getDocumentText()
216 {
217 HTMLDoc htmlDoc;
218 URL url =(URL) this.fileSet.getFile(0).getLocation();
219
220 this.getSectionText("1");
221
222 if (url.toString().startsWith("file://"))
223 { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
224 }
225 else if (url.toString().startsWith("file:/"))
226 { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
227 }
228 else
229 { htmlDoc = new HTMLDoc(url);
230 }
231 return htmlDoc.getContent();
232 }
233
234 public Document getDOMDocument()
235 {
236 if (this.domDocument == null) {
237 URL url =(URL) this.fileSet.getFile(0).getLocation();
238 this.loadDocument(url);
239 }
240 return this.domDocument;
241 }
242
243 private XPointer getXPointer(METSDivision division)
244 { String fileId = division.getDefaultFileReference();
245
246 if (fileId == null) {
247 System.err.println("Unable to obtain file reference for section " + division.getLabel());
248 return null;
249 }
250
251 METSFileGroup fileGroup = this.getDocumentFiles().getGroup(fileId);
252 if (fileGroup == null) {
253 System.err.println("Unable to obtain file reference for filegroup " + fileId);
254 return null;
255 }
256
257 METSFile file = fileGroup.getFile(0);
258 if (file == null) {
259 System.err.println("Unable to obtain any files within filegroup " + fileId);
260 return null;
261 }
262
263 URL url = file.getLocation();
264 XPointer xpointer = XPointer.processXPointer(this.domDocument, url);
265
266 return xpointer;
267 }
268
269 private XPointer getXPointer(String sectionId)
270 { if (this.domDocument == null) {
271 URL url =(URL) this.fileSet.getFile(0).getLocation();
272 this.loadDocument(url);
273 }
274
275 METSDivision division =
276 this.getDocumentStructure().getDivision(METSStructureSet.GSDL3_SECTION_STRUCTURE,
277 sectionId);
278 if (division == null) {
279 // System.err.println("Unable to locate section " + sectionId);
280 return null;
281 }
282
283 return this.getXPointer(division);
284 }
285
286 public Node getSectionStartNode(METSDivision division)
287 { XPointer xpointer = this.getXPointer(division);
288 return xpointer.getStartNode();
289 }
290
291 public Node getSectionStartNode(String sectionId)
292 { XPointer xpointer = this.getXPointer(sectionId);
293
294 return xpointer.getStartNode();
295 }
296
297 public String getSectionText(String sectionId)
298 { XPointer xpointer = this.getXPointer(sectionId);
299
300 if (xpointer == null) {
301 return "";
302 }
303
304 // get the XML content of the xpointer...
305 return xpointer.toString();
306 }
307}
Note: See TracBrowser for help on using the repository browser.