Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 5800

Last change on this file since 5800 was 5800, checked in by cs025, 20 years ago
Adding gs3build
Property svn:keywords set to `Author Date Id Revision`
File size: 3.4 KB

Line
1	package org.greenstone.gsdl3.gs3build.doctypes;
2
3	import java.io.File;
4	import java.net.URL;
5	import java.net.MalformedURLException;
6
7	import java.util.List;
8	import java.util.Map;
9	import java.util.ArrayList;
10
11	import org.greenstone.gsdl3.gs3build.metadata.METSFile;
12	import org.greenstone.gsdl3.gs3build.util.*;
13
14	public class HTMLDocument extends AbstractDocument
15	{
16	public static final String HTML_DOCUMENT_TYPE = "HTML";
17
18	/**
19	* Create the HTMLDocument from a given URL - the URL may in fact be a reference
20	* to a local file.
21	*
22	* @param <code>URL</code> The location from which to load the file
23	*/
24	public HTMLDocument(URL url)
25	{ super(url);
26
27	HTMLDoc htmlDoc;
28	if (url.toString().startsWith("file://"))
29	{ htmlDoc = new HTMLDoc(url, url.toString().substring(7));
30	}
31	else
32	{ htmlDoc = new HTMLDoc(url);
33	}
34
35	this._extractDocumentFiles(htmlDoc);
36	this._extractDocumentMetadata(htmlDoc);
37	}
38
39	private void _extractDocumentMetadata(HTMLDoc htmlDoc)
40	{ HTMLBlock codedContent = htmlDoc.getCodedContent();
41	boolean inTitle = false;
42	StringBuffer title = new StringBuffer();
43
44	for (int e = 0; e < codedContent.size(); e ++)
45	{ if (codedContent.elementAt(e) instanceof HTMLTag)
46	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
47
48	if (tag.tagName().equals("meta"))
49	{ // check that the name of the metadata item exists
50	String name = tag.idValue("name");
51	if (name == null \|\| name.length() == 0) {
52	continue;
53	}
54
55	// get the value, if it exists
56	String value = tag.idValue("content");
57	if (value != null && value.length() > 0) {
58	System.out.println(" " + value);
59	}
60	// if value does not exist, default it to being the same
61	// as the name.
62	else {
63	value = name;
64	}
65
66	this.addDocumentMetadata(name, value);
67	}
68	else if (tag.tagName().equals("title"))
69	{ inTitle = true;
70	}
71	else if (tag.tagName().equals("/title"))
72	{ inTitle = false;
73	}
74	// cut off when real body content appears - not a perfect
75	// implementation, just cheap & cheerful
76	else if (tag.tagName().equals("/head"))
77	{ break;
78	}
79	else if (tag.tagName().equals("body"))
80	{ break;
81	}
82	}
83	else if (inTitle == true)
84	{ title.append(codedContent.elementAt(e).toString());
85	}
86	}
87	if (title.length() > 0)
88	{ this.addDocumentMetadata("title", title.toString());
89	}
90	}
91
92	private void _extractDocumentFiles(HTMLDoc htmlDoc)
93	{ URL homeUrl = this.fileSet.getFile(0).getLocation();
94
95	HTMLBlock codedContent = htmlDoc.getCodedContent();
96	for (int e = 0; e < codedContent.size(); e ++)
97	{ if (codedContent.elementAt(e) instanceof HTMLTag)
98	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
99
100	if (tag.tagName().equals("img"))
101	{ String location = tag.idValue("src");
102
103	try
104	{ // make the url for the image, and then add it to the document list of
105	//
106	URL imgUrl = new URL(homeUrl, location);
107	METSFile file = this.fileSet.addFile(imgUrl);
108	this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
109	}
110	catch (MalformedURLException ex)
111	{ // TODO: report exception/failure to resolve...
112	}
113	}
114	}
115	}
116	}
117
118	public String getDocumentType()
119	{ return HTML_DOCUMENT_TYPE;
120	}
121
122	public String getDocumentText()
123	{ HTMLDoc htmlDoc = new HTMLDoc((URL) this.fileSet.getFile(0).getLocation(), this.fileSet.getFile(0).toString().substring(7));
124	return htmlDoc.getContent();
125	}
126	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: