1 | package org.greenstone.gsdl3.gs3build.doctypes;
|
---|
2 |
|
---|
3 | import java.io.File;
|
---|
4 | import java.net.URL;
|
---|
5 | import java.net.MalformedURLException;
|
---|
6 |
|
---|
7 | import java.util.List;
|
---|
8 | import java.util.Map;
|
---|
9 | import java.util.ArrayList;
|
---|
10 |
|
---|
11 | import org.greenstone.gsdl3.gs3build.metadata.METSFile;
|
---|
12 | import org.greenstone.gsdl3.gs3build.util.*;
|
---|
13 |
|
---|
14 | public class HTMLDocument extends AbstractDocument
|
---|
15 | {
|
---|
16 | public static final String HTML_DOCUMENT_TYPE = "HTML";
|
---|
17 |
|
---|
18 | public HTMLDocument(DocumentID id)
|
---|
19 | { super(id);
|
---|
20 | }
|
---|
21 |
|
---|
22 | /**
|
---|
23 | * Create the HTMLDocument from a given URL - the URL may in fact be a reference
|
---|
24 | * to a local file.
|
---|
25 | *
|
---|
26 | * @param <code>URL</code> The location from which to load the file
|
---|
27 | */
|
---|
28 | public HTMLDocument(URL url)
|
---|
29 | { super(url);
|
---|
30 |
|
---|
31 | HTMLDoc htmlDoc;
|
---|
32 | if (url.toString().startsWith("file://"))
|
---|
33 | { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
|
---|
34 | }
|
---|
35 | else if (url.toString().startsWith("file:/"))
|
---|
36 | { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
|
---|
37 | }
|
---|
38 | else
|
---|
39 | { htmlDoc = new HTMLDoc(url);
|
---|
40 | }
|
---|
41 |
|
---|
42 | this._extractDocumentFiles(htmlDoc);
|
---|
43 | this._extractDocumentMetadata(htmlDoc);
|
---|
44 | }
|
---|
45 |
|
---|
46 | private void _extractDocumentMetadata(HTMLDoc htmlDoc)
|
---|
47 | { HTMLBlock codedContent = htmlDoc.getCodedContent();
|
---|
48 | boolean inTitle = false;
|
---|
49 | StringBuffer title = new StringBuffer();
|
---|
50 |
|
---|
51 | for (int e = 0; e < codedContent.size(); e ++)
|
---|
52 | { if (codedContent.elementAt(e) instanceof HTMLTag)
|
---|
53 | { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
|
---|
54 |
|
---|
55 | if (tag.tagName().equals("meta"))
|
---|
56 | { // check that the name of the metadata item exists
|
---|
57 | String name = tag.idValue("name");
|
---|
58 | if (name == null || name.length() == 0) {
|
---|
59 | continue;
|
---|
60 | }
|
---|
61 |
|
---|
62 | // get the value, if it exists
|
---|
63 | String value = tag.idValue("content");
|
---|
64 | if (value != null && value.length() > 0) {
|
---|
65 | System.out.println(" " + value);
|
---|
66 | }
|
---|
67 | // if value does not exist, default it to being the same
|
---|
68 | // as the name.
|
---|
69 | else {
|
---|
70 | value = name;
|
---|
71 | }
|
---|
72 |
|
---|
73 | this.addDocumentMetadata(name, value);
|
---|
74 | }
|
---|
75 | else if (tag.tagName().equals("title"))
|
---|
76 | { inTitle = true;
|
---|
77 | }
|
---|
78 | else if (tag.tagName().equals("/title"))
|
---|
79 | { inTitle = false;
|
---|
80 | }
|
---|
81 | // cut off when real body content appears - not a perfect
|
---|
82 | // implementation, just cheap & cheerful
|
---|
83 | else if (tag.tagName().equals("/head"))
|
---|
84 | { break;
|
---|
85 | }
|
---|
86 | else if (tag.tagName().equals("body"))
|
---|
87 | { break;
|
---|
88 | }
|
---|
89 | }
|
---|
90 | else if (inTitle == true)
|
---|
91 | { title.append(codedContent.elementAt(e).toString());
|
---|
92 | }
|
---|
93 | }
|
---|
94 | if (title.length() > 0)
|
---|
95 | { this.addDocumentMetadata("title", title.toString());
|
---|
96 | }
|
---|
97 | }
|
---|
98 |
|
---|
99 | private void _extractDocumentFiles(HTMLDoc htmlDoc)
|
---|
100 | { URL homeUrl = this.fileSet.getFile(0).getLocation();
|
---|
101 |
|
---|
102 | HTMLBlock codedContent = htmlDoc.getCodedContent();
|
---|
103 | for (int e = 0; e < codedContent.size(); e ++)
|
---|
104 | { if (codedContent.elementAt(e) instanceof HTMLTag)
|
---|
105 | { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
|
---|
106 |
|
---|
107 | if (tag.tagName().equals("img"))
|
---|
108 | { String location = tag.idValue("src");
|
---|
109 |
|
---|
110 | try
|
---|
111 | { // make the url for the image, and then add it to the document list of
|
---|
112 | //
|
---|
113 | URL imgUrl = new URL(homeUrl, location);
|
---|
114 | METSFile file = this.fileSet.addFile(imgUrl);
|
---|
115 | this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
|
---|
116 | }
|
---|
117 | catch (MalformedURLException ex)
|
---|
118 | { // TODO: report exception/failure to resolve...
|
---|
119 | }
|
---|
120 | }
|
---|
121 | }
|
---|
122 | }
|
---|
123 | }
|
---|
124 |
|
---|
125 | public String getDocumentType()
|
---|
126 | { return HTML_DOCUMENT_TYPE;
|
---|
127 | }
|
---|
128 |
|
---|
129 | public String getDocumentText()
|
---|
130 | {
|
---|
131 | HTMLDoc htmlDoc;
|
---|
132 | URL url =(URL) this.fileSet.getFile(0).getLocation();
|
---|
133 |
|
---|
134 | if (url.toString().startsWith("file://"))
|
---|
135 | { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
|
---|
136 | }
|
---|
137 | else if (url.toString().startsWith("file:/"))
|
---|
138 | { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
|
---|
139 | }
|
---|
140 | else
|
---|
141 | { htmlDoc = new HTMLDoc(url);
|
---|
142 | }
|
---|
143 | return htmlDoc.getContent();
|
---|
144 | }
|
---|
145 |
|
---|
146 | public String getSectionText(String document)
|
---|
147 | { return "";
|
---|
148 | }
|
---|
149 | }
|
---|