source: tags/ant-install-branch-merged-1/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 9873

Last change on this file since 9873 was 9873, checked in by (none), 19 years ago

This commit was manufactured by cvs2svn to create tag
'ant-install-branch-merged-1'.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1package org.greenstone.gsdl3.gs3build.doctypes;
2
3import java.io.File;
4import java.net.URL;
5import java.net.MalformedURLException;
6
7import java.util.Iterator;
8import java.util.List;
9import java.util.Map;
10import java.util.ArrayList;
11
12import org.greenstone.gsdl3.gs3build.metadata.*;
13import org.greenstone.gsdl3.gs3build.util.*;
14import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
15import org.greenstone.gsdl3.util.XMLConverter;
16
17import org.w3c.dom.*;
18
19public class HTMLDocument extends AbstractDocument
20{
21 public static final String HTML_DOCUMENT_TYPE = "HTML";
22 Document domDocument;
23
24 public HTMLDocument(DocumentID id)
25 { super(id);
26 this.domDocument = null;
27 }
28
29 /**
30 * Create the HTMLDocument from a given URL - the URL may in fact be a reference
31 * to a local file.
32 *
33 * @param <code>URL</code> The location from which to load the file
34 */
35 public HTMLDocument(URL url)
36 { super(url);
37
38 this.loadDocument(url);
39
40 this._extractDocumentFiles();
41 this._extractDocumentMetadata();
42 // extract the files before resolving the images
43 resolveImages();
44 HTMLDocumentTools docTools = new HTMLDocumentTools(this.domDocument);
45 docTools.setMetsDocument(this);
46 docTools.setUrl(this.fileSet.getFile(0).getLocation());
47 docTools.findSections();
48 METSStructure sectionStruct = docTools.getStructure();
49
50 if (sectionStruct.size() > 0) {
51 METSStructureSet structureSet = this.getDocumentStructure();
52 structureSet.addStructure(sectionStruct);
53 }
54 }
55
56 private void loadDocument(URL url)
57 {
58 // HTMLDoc htmlDoc;
59
60 HTMLTidy tidyDoc;
61 if (url.getProtocol().equals("file"))
62 { tidyDoc = new HTMLTidy(new File(url.getPath()));
63 }
64 else
65 { //htmlDoc = new HTMLDoc(url);
66 tidyDoc = new HTMLTidy(url);
67 }
68
69 // System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().totalMemory());
70
71 this.domDocument = tidyDoc.getDocument();
72 Runtime.getRuntime().gc();
73 // System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().totalMemory());
74 }
75
76 private void _extractDocumentMetadata()
77 {
78 NodeList metadata = this.domDocument.getElementsByTagName("META");
79 for (int n = 0; n < metadata.getLength(); n ++) {
80 Node node = metadata.item(n);
81 Element element = (Element) node;
82
83 String name = element.getAttribute("name");
84 if (name == null || name.length() == 0) {
85 continue;
86 }
87
88 String value = element.getAttribute("content");
89 if (value == null || value.length() == 0) {
90 value = name;
91 }
92
93 this.addDocumentMetadata(new MetadataLabel(name), value);
94 }
95
96 NodeList titles = this.domDocument.getElementsByTagName("TITLE");
97 StringBuffer title = new StringBuffer();
98 for (int n = 0; n < titles.getLength(); n ++) {
99 Node node = titles.item(n);
100 Element element = (Element) node;
101
102 NodeList childNodes = node.getChildNodes();
103 for (int c = 0; c < childNodes.getLength(); c ++) {
104 Node child = childNodes.item(c);
105 if (child.getNodeType() == org.w3c.dom.Node.TEXT_NODE) {
106 title.append(child.getNodeValue());
107 }
108 }
109 }
110
111 if (title.length() > 0)
112 { this.addDocumentMetadata(new MetadataLabel("Title"), title.toString());
113 }
114
115 /* Old HTMLDoc based parsing...
116 HTMLBlock codedContent = htmlDoc.getCodedContent();
117 boolean inTitle = false;
118 title = new StringBuffer();
119
120 for (int e = 0; e < codedContent.size(); e ++)
121 { if (codedContent.elementAt(e) instanceof HTMLTag)
122 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
123
124 if (tag.tagName().equals("meta"))
125 { // check that the name of the metadata item exists
126 String name = tag.idValue("name");
127 if (name == null || name.length() == 0) {
128 continue;
129 }
130
131 // get the value, if it exists
132 String value = tag.idValue("content");
133
134 // if value does not exist, default it to being the same
135 // as the name.
136 if (value == null || value.length() > 0) {
137 value = name;
138 }
139 }
140 else
141 if (tag.tagName().equals("title"))
142 { inTitle = true;
143 }
144 else if (tag.tagName().equals("/title"))
145 { inTitle = false;
146 }
147 // cut off when real body content appears - not a perfect
148 // implementation, just cheap & cheerful
149 else if (tag.tagName().equals("/head"))
150 { break;
151 }
152 else if (tag.tagName().equals("body"))
153 { break;
154 }
155 }
156 else if (inTitle == true)
157 { title.append(codedContent.elementAt(e).toString());
158 }
159 }
160 */
161 }
162
163 private void _extractDocumentFiles()
164 { URL homeUrl = this.fileSet.getFile(0).getLocation();
165
166 NodeList metadata = this.domDocument.getElementsByTagName("img");
167 for (int n = 0; n < metadata.getLength(); n ++) {
168 Node node = metadata.item(n);
169 Element element = (Element) node;
170
171 String location = element.getAttribute("src");
172 if (location == null || location.length() == 0) {
173 System.out.println("No name");
174 continue;
175 }
176 try
177 { // make the url for the image, and then add it to the document list of
178 //
179 URL imgUrl = new URL(homeUrl, location);
180 METSFile file = this.fileSet.addFile(imgUrl);
181 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
182 }
183 catch (MalformedURLException ex)
184 { // TODO: report exception/failure to resolve...
185 }
186 }
187
188
189 /**
190 HTMLBlock codedContent = htmlDoc.getCodedContent();
191 for (int e = 0; e < codedContent.size(); e ++)
192 { if (codedContent.elementAt(e) instanceof HTMLTag)
193 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
194
195 if (tag.tagName().equals("img"))
196 { String location = tag.idValue("src");
197
198 try
199 { // make the url for the image, and then add it to the document list of
200 //
201 URL imgUrl = new URL(homeUrl, location);
202 METSFile file = this.fileSet.addFile(imgUrl);
203 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
204 }
205 catch (MalformedURLException ex)
206 { // TODO: report exception/failure to resolve...
207 }
208 }
209 }
210 }
211 */
212 }
213
214 public String getDocumentType()
215 { return HTML_DOCUMENT_TYPE;
216 }
217
218 // I think this is used for single section documents, while getSectionText
219 // is used for sectioned documents
220 // we will use the domDocument rather than reading it in again to another HTMLDoc.
221 public String getDocumentText()
222 {
223 XMLConverter converter = new XMLConverter();
224 return converter.getPrettyString(this.domDocument.getDocumentElement());
225 /* HTMLDoc htmlDoc;
226 URL url =(URL) this.fileSet.getFile(0).getLocation();
227
228 this.getSectionText("1");
229
230 if (url.getProtocol().equals("file"))
231 { htmlDoc = new HTMLDoc(url, url.getPath());
232 }
233 else
234 { htmlDoc = new HTMLDoc(url);
235 }
236 return htmlDoc.getContent();
237 */
238 }
239
240 private void resolveImages() {
241
242 // find the path of the url relative to the collection
243 URL full_path = this.fileSet.getFile(0).getLocation();
244
245 String base_url;
246 if (full_path.getProtocol().equals("file")) {
247 base_url = full_path.getPath();
248 int import_pos = base_url.indexOf("import");
249 base_url = base_url.substring(import_pos);
250 base_url = "_httpcollection_/"+base_url;
251 } else {
252 base_url = full_path.toString();
253 }
254
255 // need to take off the last part
256 base_url = base_url.substring(0, base_url.lastIndexOf("/")+1);
257
258 NodeList metadata = this.domDocument.getElementsByTagName("img");
259 for (int n = 0; n < metadata.getLength(); n ++) {
260 Node node = metadata.item(n);
261 Element element = (Element) node;
262
263 String location = element.getAttribute("src");
264 if (location != null && location.length() > 0 && isRelative(location)) {
265 // modify the source url
266 element.setAttribute("src", base_url+location);
267 }
268 }
269 }
270
271 private boolean isRelative(String location) {
272
273 if ( location.startsWith("http:") || location.startsWith("file:")) {
274 return false;
275 }
276 return true;
277 }
278
279 public Document getDOMDocument()
280 {
281 if (this.domDocument == null) {
282 URL url =(URL) this.fileSet.getFile(0).getLocation();
283 this.loadDocument(url);
284 resolveImages();
285 }
286 return this.domDocument;
287 }
288
289 private XPointer getXPointer(METSDivision division)
290 { String fileId = division.getDefaultFileReference();
291
292 if (fileId == null) {
293 System.err.println("Unable to obtain file reference for section " + division.getLabel());
294 return null;
295 }
296
297 METSFileGroup fileGroup = this.getDocumentFiles().getGroup(fileId);
298 if (fileGroup == null) {
299 System.err.println("Unable to obtain file reference for filegroup " + fileId);
300 return null;
301 }
302
303 METSFile file = fileGroup.getFile(0);
304 if (file == null) {
305 System.err.println("Unable to obtain any files within filegroup " + fileId);
306 return null;
307 }
308
309 URL url = file.getLocation();
310 XPointer xpointer = XPointer.processXPointer(this.domDocument, url);
311
312 return xpointer;
313 }
314
315 private XPointer getXPointer(String sectionId)
316 { if (this.domDocument == null) {
317 URL url =(URL) this.fileSet.getFile(0).getLocation();
318 this.loadDocument(url);
319 resolveImages();
320 }
321
322 METSDivision division =
323 this.getDocumentStructure().getDivision(METSStructureSet.GSDL3_SECTION_STRUCTURE,
324 sectionId);
325 if (division == null) {
326 // System.err.println("Unable to locate section " + sectionId);
327 return null;
328 }
329
330 return this.getXPointer(division);
331 }
332
333 public Node getSectionStartNode(METSDivision division)
334 { XPointer xpointer = this.getXPointer(division);
335 return xpointer.getStartNode();
336 }
337
338 public Node getSectionStartNode(String sectionId)
339 { XPointer xpointer = this.getXPointer(sectionId);
340
341 return xpointer.getStartNode();
342 }
343
344 public String getSectionText(String sectionId)
345 { XPointer xpointer = this.getXPointer(sectionId);
346
347 if (xpointer == null) {
348 return "";
349 }
350
351 // get the XML content of the xpointer...
352 return xpointer.toString();
353 }
354}
Note: See TracBrowser for help on using the repository browser.