source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 5800

Last change on this file since 5800 was 5800, checked in by cs025, 20 years ago

Adding gs3build

  • Property svn:keywords set to Author Date Id Revision
File size: 3.4 KB
Line 
1package org.greenstone.gsdl3.gs3build.doctypes;
2
3import java.io.File;
4import java.net.URL;
5import java.net.MalformedURLException;
6
7import java.util.List;
8import java.util.Map;
9import java.util.ArrayList;
10
11import org.greenstone.gsdl3.gs3build.metadata.METSFile;
12import org.greenstone.gsdl3.gs3build.util.*;
13
14public class HTMLDocument extends AbstractDocument
15{
16 public static final String HTML_DOCUMENT_TYPE = "HTML";
17
18 /**
19 * Create the HTMLDocument from a given URL - the URL may in fact be a reference
20 * to a local file.
21 *
22 * @param <code>URL</code> The location from which to load the file
23 */
24 public HTMLDocument(URL url)
25 { super(url);
26
27 HTMLDoc htmlDoc;
28 if (url.toString().startsWith("file://"))
29 { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
30 }
31 else
32 { htmlDoc = new HTMLDoc(url);
33 }
34
35 this._extractDocumentFiles(htmlDoc);
36 this._extractDocumentMetadata(htmlDoc);
37 }
38
39 private void _extractDocumentMetadata(HTMLDoc htmlDoc)
40 { HTMLBlock codedContent = htmlDoc.getCodedContent();
41 boolean inTitle = false;
42 StringBuffer title = new StringBuffer();
43
44 for (int e = 0; e < codedContent.size(); e ++)
45 { if (codedContent.elementAt(e) instanceof HTMLTag)
46 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
47
48 if (tag.tagName().equals("meta"))
49 { // check that the name of the metadata item exists
50 String name = tag.idValue("name");
51 if (name == null || name.length() == 0) {
52 continue;
53 }
54
55 // get the value, if it exists
56 String value = tag.idValue("content");
57 if (value != null && value.length() > 0) {
58 System.out.println(" " + value);
59 }
60 // if value does not exist, default it to being the same
61 // as the name.
62 else {
63 value = name;
64 }
65
66 this.addDocumentMetadata(name, value);
67 }
68 else if (tag.tagName().equals("title"))
69 { inTitle = true;
70 }
71 else if (tag.tagName().equals("/title"))
72 { inTitle = false;
73 }
74 // cut off when real body content appears - not a perfect
75 // implementation, just cheap & cheerful
76 else if (tag.tagName().equals("/head"))
77 { break;
78 }
79 else if (tag.tagName().equals("body"))
80 { break;
81 }
82 }
83 else if (inTitle == true)
84 { title.append(codedContent.elementAt(e).toString());
85 }
86 }
87 if (title.length() > 0)
88 { this.addDocumentMetadata("title", title.toString());
89 }
90 }
91
92 private void _extractDocumentFiles(HTMLDoc htmlDoc)
93 { URL homeUrl = this.fileSet.getFile(0).getLocation();
94
95 HTMLBlock codedContent = htmlDoc.getCodedContent();
96 for (int e = 0; e < codedContent.size(); e ++)
97 { if (codedContent.elementAt(e) instanceof HTMLTag)
98 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
99
100 if (tag.tagName().equals("img"))
101 { String location = tag.idValue("src");
102
103 try
104 { // make the url for the image, and then add it to the document list of
105 //
106 URL imgUrl = new URL(homeUrl, location);
107 METSFile file = this.fileSet.addFile(imgUrl);
108 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
109 }
110 catch (MalformedURLException ex)
111 { // TODO: report exception/failure to resolve...
112 }
113 }
114 }
115 }
116 }
117
118 public String getDocumentType()
119 { return HTML_DOCUMENT_TYPE;
120 }
121
122 public String getDocumentText()
123 { HTMLDoc htmlDoc = new HTMLDoc((URL) this.fileSet.getFile(0).getLocation(), this.fileSet.getFile(0).toString().substring(7));
124 return htmlDoc.getContent();
125 }
126}
Note: See TracBrowser for help on using the repository browser.