Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: tags/ant-install-branch-merged-1/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 9873

Last change on this file since 9873 was 9873, checked in by (none), 19 years ago
This commit was manufactured by cvs2svn to create tag 'ant-install-branch-merged-1'.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.8 KB

Line
1	package org.greenstone.gsdl3.gs3build.doctypes;
2
3	import java.io.File;
4	import java.net.URL;
5	import java.net.MalformedURLException;
6
7	import java.util.Iterator;
8	import java.util.List;
9	import java.util.Map;
10	import java.util.ArrayList;
11
12	import org.greenstone.gsdl3.gs3build.metadata.*;
13	import org.greenstone.gsdl3.gs3build.util.*;
14	import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
15	import org.greenstone.gsdl3.util.XMLConverter;
16
17	import org.w3c.dom.*;
18
19	public class HTMLDocument extends AbstractDocument
20	{
21	public static final String HTML_DOCUMENT_TYPE = "HTML";
22	Document domDocument;
23
/**
 * Creates an HTML document for an existing document identifier.
 * Nothing is parsed here: the DOM field starts out as {@code null}.
 *
 * @param id the identifier handed on to the superclass constructor
 */
public HTMLDocument(DocumentID id)
{
    super(id);
    domDocument = null;
}
28
29	/**
30	* Create the HTMLDocument from a given URL - the URL may in fact be a reference
31	* to a local file.
32	*
33	* @param <code>URL</code> The location from which to load the file
34	*/
35	public HTMLDocument(URL url)
36	{ super(url);
37
38	this.loadDocument(url);
39
40	this._extractDocumentFiles();
41	this._extractDocumentMetadata();
42	// extract the files before resolving the images
43	resolveImages();
44	HTMLDocumentTools docTools = new HTMLDocumentTools(this.domDocument);
45	docTools.setMetsDocument(this);
46	docTools.setUrl(this.fileSet.getFile(0).getLocation());
47	docTools.findSections();
48	METSStructure sectionStruct = docTools.getStructure();
49
50	if (sectionStruct.size() > 0) {
51	METSStructureSet structureSet = this.getDocumentStructure();
52	structureSet.addStructure(sectionStruct);
53	}
54	}
55
56	private void loadDocument(URL url)
57	{
58	// HTMLDoc htmlDoc;
59
60	HTMLTidy tidyDoc;
61	if (url.getProtocol().equals("file"))
62	{ tidyDoc = new HTMLTidy(new File(url.getPath()));
63	}
64	else
65	{ //htmlDoc = new HTMLDoc(url);
66	tidyDoc = new HTMLTidy(url);
67	}
68
69	// System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().totalMemory());
70
71	this.domDocument = tidyDoc.getDocument();
72	Runtime.getRuntime().gc();
73	// System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().totalMemory());
74	}
75
76	private void _extractDocumentMetadata()
77	{
78	NodeList metadata = this.domDocument.getElementsByTagName("META");
79	for (int n = 0; n < metadata.getLength(); n ++) {
80	Node node = metadata.item(n);
81	Element element = (Element) node;
82
83	String name = element.getAttribute("name");
84	if (name == null \|\| name.length() == 0) {
85	continue;
86	}
87
88	String value = element.getAttribute("content");
89	if (value == null \|\| value.length() == 0) {
90	value = name;
91	}
92
93	this.addDocumentMetadata(new MetadataLabel(name), value);
94	}
95
96	NodeList titles = this.domDocument.getElementsByTagName("TITLE");
97	StringBuffer title = new StringBuffer();
98	for (int n = 0; n < titles.getLength(); n ++) {
99	Node node = titles.item(n);
100	Element element = (Element) node;
101
102	NodeList childNodes = node.getChildNodes();
103	for (int c = 0; c < childNodes.getLength(); c ++) {
104	Node child = childNodes.item(c);
105	if (child.getNodeType() == org.w3c.dom.Node.TEXT_NODE) {
106	title.append(child.getNodeValue());
107	}
108	}
109	}
110
111	if (title.length() > 0)
112	{ this.addDocumentMetadata(new MetadataLabel("Title"), title.toString());
113	}
114
115	/* Old HTMLDoc based parsing...
116	HTMLBlock codedContent = htmlDoc.getCodedContent();
117	boolean inTitle = false;
118	title = new StringBuffer();
119
120	for (int e = 0; e < codedContent.size(); e ++)
121	{ if (codedContent.elementAt(e) instanceof HTMLTag)
122	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
123
124	if (tag.tagName().equals("meta"))
125	{ // check that the name of the metadata item exists
126	String name = tag.idValue("name");
127	if (name == null \|\| name.length() == 0) {
128	continue;
129	}
130
131	// get the value, if it exists
132	String value = tag.idValue("content");
133
134	// if value does not exist, default it to being the same
135	// as the name.
136	if (value == null \|\| value.length() > 0) {
137	value = name;
138	}
139	}
140	else
141	if (tag.tagName().equals("title"))
142	{ inTitle = true;
143	}
144	else if (tag.tagName().equals("/title"))
145	{ inTitle = false;
146	}
147	// cut off when real body content appears - not a perfect
148	// implementation, just cheap & cheerful
149	else if (tag.tagName().equals("/head"))
150	{ break;
151	}
152	else if (tag.tagName().equals("body"))
153	{ break;
154	}
155	}
156	else if (inTitle == true)
157	{ title.append(codedContent.elementAt(e).toString());
158	}
159	}
160	*/
161	}
162
163	private void _extractDocumentFiles()
164	{ URL homeUrl = this.fileSet.getFile(0).getLocation();
165
166	NodeList metadata = this.domDocument.getElementsByTagName("img");
167	for (int n = 0; n < metadata.getLength(); n ++) {
168	Node node = metadata.item(n);
169	Element element = (Element) node;
170
171	String location = element.getAttribute("src");
172	if (location == null \|\| location.length() == 0) {
173	System.out.println("No name");
174	continue;
175	}
176	try
177	{ // make the url for the image, and then add it to the document list of
178	//
179	URL imgUrl = new URL(homeUrl, location);
180	METSFile file = this.fileSet.addFile(imgUrl);
181	this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
182	}
183	catch (MalformedURLException ex)
184	{ // TODO: report exception/failure to resolve...
185	}
186	}
187
188
189	/**
190	HTMLBlock codedContent = htmlDoc.getCodedContent();
191	for (int e = 0; e < codedContent.size(); e ++)
192	{ if (codedContent.elementAt(e) instanceof HTMLTag)
193	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
194
195	if (tag.tagName().equals("img"))
196	{ String location = tag.idValue("src");
197
198	try
199	{ // make the url for the image, and then add it to the document list of
200	//
201	URL imgUrl = new URL(homeUrl, location);
202	METSFile file = this.fileSet.addFile(imgUrl);
203	this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
204	}
205	catch (MalformedURLException ex)
206	{ // TODO: report exception/failure to resolve...
207	}
208	}
209	}
210	}
211	*/
212	}
213
214	public String getDocumentType()
215	{ return HTML_DOCUMENT_TYPE;
216	}
217
218	// I think this is used for single section documents, while getSectionText
219	// is used for sectioned documents
220	// we will use the domDocument rather than reading it in again to another HTMLDoc.
221	public String getDocumentText()
222	{
223	XMLConverter converter = new XMLConverter();
224	return converter.getPrettyString(this.domDocument.getDocumentElement());
225	/* HTMLDoc htmlDoc;
226	URL url =(URL) this.fileSet.getFile(0).getLocation();
227
228	this.getSectionText("1");
229
230	if (url.getProtocol().equals("file"))
231	{ htmlDoc = new HTMLDoc(url, url.getPath());
232	}
233	else
234	{ htmlDoc = new HTMLDoc(url);
235	}
236	return htmlDoc.getContent();
237	*/
238	}
239
240	private void resolveImages() {
241
242	// find the path of the url relative to the collection
243	URL full_path = this.fileSet.getFile(0).getLocation();
244
245	String base_url;
246	if (full_path.getProtocol().equals("file")) {
247	base_url = full_path.getPath();
248	int import_pos = base_url.indexOf("import");
249	base_url = base_url.substring(import_pos);
250	base_url = "_httpcollection_/"+base_url;
251	} else {
252	base_url = full_path.toString();
253	}
254
255	// need to take off the last part
256	base_url = base_url.substring(0, base_url.lastIndexOf("/")+1);
257
258	NodeList metadata = this.domDocument.getElementsByTagName("img");
259	for (int n = 0; n < metadata.getLength(); n ++) {
260	Node node = metadata.item(n);
261	Element element = (Element) node;
262
263	String location = element.getAttribute("src");
264	if (location != null && location.length() > 0 && isRelative(location)) {
265	// modify the source url
266	element.setAttribute("src", base_url+location);
267	}
268	}
269	}
270
271	private boolean isRelative(String location) {
272
273	if ( location.startsWith("http:") \|\| location.startsWith("file:")) {
274	return false;
275	}
276	return true;
277	}
278
279	public Document getDOMDocument()
280	{
281	if (this.domDocument == null) {
282	URL url =(URL) this.fileSet.getFile(0).getLocation();
283	this.loadDocument(url);
284	resolveImages();
285	}
286	return this.domDocument;
287	}
288
289	private XPointer getXPointer(METSDivision division)
290	{ String fileId = division.getDefaultFileReference();
291
292	if (fileId == null) {
293	System.err.println("Unable to obtain file reference for section " + division.getLabel());
294	return null;
295	}
296
297	METSFileGroup fileGroup = this.getDocumentFiles().getGroup(fileId);
298	if (fileGroup == null) {
299	System.err.println("Unable to obtain file reference for filegroup " + fileId);
300	return null;
301	}
302
303	METSFile file = fileGroup.getFile(0);
304	if (file == null) {
305	System.err.println("Unable to obtain any files within filegroup " + fileId);
306	return null;
307	}
308
309	URL url = file.getLocation();
310	XPointer xpointer = XPointer.processXPointer(this.domDocument, url);
311
312	return xpointer;
313	}
314
315	private XPointer getXPointer(String sectionId)
316	{ if (this.domDocument == null) {
317	URL url =(URL) this.fileSet.getFile(0).getLocation();
318	this.loadDocument(url);
319	resolveImages();
320	}
321
322	METSDivision division =
323	this.getDocumentStructure().getDivision(METSStructureSet.GSDL3_SECTION_STRUCTURE,
324	sectionId);
325	if (division == null) {
326	// System.err.println("Unable to locate section " + sectionId);
327	return null;
328	}
329
330	return this.getXPointer(division);
331	}
332
333	public Node getSectionStartNode(METSDivision division)
334	{ XPointer xpointer = this.getXPointer(division);
335	return xpointer.getStartNode();
336	}
337
338	public Node getSectionStartNode(String sectionId)
339	{ XPointer xpointer = this.getXPointer(sectionId);
340
341	return xpointer.getStartNode();
342	}
343
344	public String getSectionText(String sectionId)
345	{ XPointer xpointer = this.getXPointer(sectionId);
346
347	if (xpointer == null) {
348	return "";
349	}
350
351	// get the XML content of the xpointer...
352	return xpointer.toString();
353	}
354	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: