Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 6284

Last change on this file since 6284 was 6284, checked in by cs025, 20 years ago
Added HTMLDocumentTools, also modifications to the abstract interfaces and the HTMLDocument doctype to support indexing by section.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.3 KB

Line
1	package org.greenstone.gsdl3.gs3build.doctypes;
2
3	import java.io.File;
4	import java.net.URL;
5	import java.net.MalformedURLException;
6
7	import java.util.List;
8	import java.util.Map;
9	import java.util.ArrayList;
10
11	import org.greenstone.gsdl3.gs3build.metadata.*;
12	import org.greenstone.gsdl3.gs3build.util.*;
13	import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
14
15	import org.w3c.dom.*;
16
17	public class HTMLDocument extends AbstractDocument
18	{
19	public static final String HTML_DOCUMENT_TYPE = "HTML";
20	Document domDocument;
21
22	public HTMLDocument(DocumentID id)
23	{ super(id);
24	this.domDocument = null;
25	}
26
27	/**
28	* Create the HTMLDocument from a given URL - the URL may in fact be a reference
29	* to a local file.
30	*
31	* @param <code>URL</code> The location from which to load the file
32	*/
33	public HTMLDocument(URL url)
34	{ super(url);
35
36	this.loadDocument(url);
37
38	this._extractDocumentFiles();
39	this._extractDocumentMetadata();
40
41	HTMLDocumentTools docTools = new HTMLDocumentTools(this.domDocument);
42	docTools.setMetsDocument(this);
43	docTools.setUrl(this.fileSet.getFile(0).getLocation());
44	docTools.findSections();
45	METSStructure sectionStruct = docTools.getStructure();
46
47	if (sectionStruct.size() > 0) {
48	METSStructureSet structureSet = this.getDocumentStructure();
49	structureSet.addStructure(sectionStruct);
50	}
51	}
52
53	private void loadDocument(URL url)
54	{
55	// HTMLDoc htmlDoc;
56	HTMLTidy tidyDoc;
57	if (url.toString().startsWith("file://"))
58	{ //htmlDoc = new HTMLDoc(url, url.toString().substring(7));
59	tidyDoc = new HTMLTidy(new File(url.toString().substring(7)));
60	}
61	else if (url.toString().startsWith("file:/"))
62	{ //htmlDoc = new HTMLDoc(url, url.toString().substring(5));
63	tidyDoc = new HTMLTidy(new File( url.toString().substring(5)));
64	}
65	else
66	{ //htmlDoc = new HTMLDoc(url);
67	tidyDoc = new HTMLTidy(url);
68	}
69
70	this.domDocument = tidyDoc.getDocument();
71	}
72
73	private void _extractDocumentMetadata()
74	{
75	NodeList metadata = this.domDocument.getElementsByTagName("META");
76	for (int n = 0; n < metadata.getLength(); n ++) {
77	Node node = metadata.item(n);
78	Element element = (Element) node;
79
80	String name = element.getAttribute("name");
81	if (name == null \|\| name.length() == 0) {
82	continue;
83	}
84
85	String value = element.getAttribute("content");
86	if (value == null \|\| value.length() == 0) {
87	value = name;
88	}
89
90	this.addDocumentMetadata(new MetadataLabel(name), value);
91	}
92
93	NodeList titles = this.domDocument.getElementsByTagName("TITLE");
94	StringBuffer title = new StringBuffer();
95	for (int n = 0; n < titles.getLength(); n ++) {
96	Node node = titles.item(n);
97	Element element = (Element) node;
98
99	NodeList childNodes = node.getChildNodes();
100	for (int c = 0; c < childNodes.getLength(); c ++) {
101	Node child = childNodes.item(c);
102	if (child.getNodeType() == org.w3c.dom.Node.TEXT_NODE) {
103	title.append(child.getNodeValue());
104	}
105	}
106	}
107
108	if (title.length() > 0)
109	{ this.addDocumentMetadata(new MetadataLabel("title"), title.toString());
110	}
111
112	/* Old HTMLDoc based parsing...
113	HTMLBlock codedContent = htmlDoc.getCodedContent();
114	boolean inTitle = false;
115	title = new StringBuffer();
116
117	for (int e = 0; e < codedContent.size(); e ++)
118	{ if (codedContent.elementAt(e) instanceof HTMLTag)
119	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
120
121	if (tag.tagName().equals("meta"))
122	{ // check that the name of the metadata item exists
123	String name = tag.idValue("name");
124	if (name == null \|\| name.length() == 0) {
125	continue;
126	}
127
128	// get the value, if it exists
129	String value = tag.idValue("content");
130
131	// if value does not exist, default it to being the same
132	// as the name.
133	if (value == null \|\| value.length() > 0) {
134	value = name;
135	}
136	}
137	else
138	if (tag.tagName().equals("title"))
139	{ inTitle = true;
140	}
141	else if (tag.tagName().equals("/title"))
142	{ inTitle = false;
143	}
144	// cut off when real body content appears - not a perfect
145	// implementation, just cheap & cheerful
146	else if (tag.tagName().equals("/head"))
147	{ break;
148	}
149	else if (tag.tagName().equals("body"))
150	{ break;
151	}
152	}
153	else if (inTitle == true)
154	{ title.append(codedContent.elementAt(e).toString());
155	}
156	}
157	*/
158	}
159
160	private void _extractDocumentFiles()
161	{ URL homeUrl = this.fileSet.getFile(0).getLocation();
162
163	NodeList metadata = this.domDocument.getElementsByTagName("img");
164	for (int n = 0; n < metadata.getLength(); n ++) {
165	Node node = metadata.item(n);
166	Element element = (Element) node;
167
168	String location = element.getAttribute("src");
169	if (location == null \|\| location.length() == 0) {
170	System.out.println("No name");
171	continue;
172	}
173
174	try
175	{ // make the url for the image, and then add it to the document list of
176	//
177	URL imgUrl = new URL(homeUrl, location);
178	METSFile file = this.fileSet.addFile(imgUrl);
179	this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
180	}
181	catch (MalformedURLException ex)
182	{ // TODO: report exception/failure to resolve...
183	}
184	}
185
186	/**
187	HTMLBlock codedContent = htmlDoc.getCodedContent();
188	for (int e = 0; e < codedContent.size(); e ++)
189	{ if (codedContent.elementAt(e) instanceof HTMLTag)
190	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
191
192	if (tag.tagName().equals("img"))
193	{ String location = tag.idValue("src");
194
195	try
196	{ // make the url for the image, and then add it to the document list of
197	//
198	URL imgUrl = new URL(homeUrl, location);
199	METSFile file = this.fileSet.addFile(imgUrl);
200	this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
201	}
202	catch (MalformedURLException ex)
203	{ // TODO: report exception/failure to resolve...
204	}
205	}
206	}
207	}
208	*/
209	}
210
211	public String getDocumentType()
212	{ return HTML_DOCUMENT_TYPE;
213	}
214
215	public String getDocumentText()
216	{
217	HTMLDoc htmlDoc;
218	URL url =(URL) this.fileSet.getFile(0).getLocation();
219
220	this.getSectionText("1");
221
222	if (url.toString().startsWith("file://"))
223	{ htmlDoc = new HTMLDoc(url, url.toString().substring(7));
224	}
225	else if (url.toString().startsWith("file:/"))
226	{ htmlDoc = new HTMLDoc(url, url.toString().substring(5));
227	}
228	else
229	{ htmlDoc = new HTMLDoc(url);
230	}
231	return htmlDoc.getContent();
232	}
233
234	public Document getDOMDocument()
235	{
236	if (this.domDocument == null) {
237	URL url =(URL) this.fileSet.getFile(0).getLocation();
238	this.loadDocument(url);
239	}
240	return this.domDocument;
241	}
242
243	private XPointer getXPointer(METSDivision division)
244	{ String fileId = division.getDefaultFileReference();
245
246	if (fileId == null) {
247	System.err.println("Unable to obtain file reference for section " + division.getLabel());
248	return null;
249	}
250
251	METSFileGroup fileGroup = this.getDocumentFiles().getGroup(fileId);
252	if (fileGroup == null) {
253	System.err.println("Unable to obtain file reference for filegroup " + fileId);
254	return null;
255	}
256
257	METSFile file = fileGroup.getFile(0);
258	if (file == null) {
259	System.err.println("Unable to obtain any files within filegroup " + fileId);
260	return null;
261	}
262
263	URL url = file.getLocation();
264	XPointer xpointer = XPointer.processXPointer(this.domDocument, url);
265
266	return xpointer;
267	}
268
269	private XPointer getXPointer(String sectionId)
270	{ if (this.domDocument == null) {
271	URL url =(URL) this.fileSet.getFile(0).getLocation();
272	this.loadDocument(url);
273	}
274
275	METSDivision division =
276	this.getDocumentStructure().getDivision(METSStructureSet.GSDL3_SECTION_STRUCTURE,
277	sectionId);
278	if (division == null) {
279	// System.err.println("Unable to locate section " + sectionId);
280	return null;
281	}
282
283	return this.getXPointer(division);
284	}
285
286	public Node getSectionStartNode(METSDivision division)
287	{ XPointer xpointer = this.getXPointer(division);
288	return xpointer.getStartNode();
289	}
290
291	public Node getSectionStartNode(String sectionId)
292	{ XPointer xpointer = this.getXPointer(sectionId);
293
294	return xpointer.getStartNode();
295	}
296
297	public String getSectionText(String sectionId)
298	{ XPointer xpointer = this.getXPointer(sectionId);
299
300	if (xpointer == null) {
301	return "";
302	}
303
304	// get the XML content of the xpointer...
305	return xpointer.toString();
306	}
307	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: