Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 6101

Last change on this file since 6101 was 6101, checked in by cs025, 20 years ago
Added getSectionText member function to documents
Property svn:keywords set to `Author Date Id Revision`
File size: 3.9 KB

Line
1	package org.greenstone.gsdl3.gs3build.doctypes;
2
3	import java.io.File;
4	import java.net.URL;
5	import java.net.MalformedURLException;
6
7	import java.util.List;
8	import java.util.Map;
9	import java.util.ArrayList;
10
11	import org.greenstone.gsdl3.gs3build.metadata.METSFile;
12	import org.greenstone.gsdl3.gs3build.util.*;
13
14	public class HTMLDocument extends AbstractDocument
15	{
16	public static final String HTML_DOCUMENT_TYPE = "HTML";
17
18	public HTMLDocument(DocumentID id)
19	{ super(id);
20	}
21
22	/**
23	* Create the HTMLDocument from a given URL - the URL may in fact be a reference
24	* to a local file.
25	*
26	* @param <code>URL</code> The location from which to load the file
27	*/
28	public HTMLDocument(URL url)
29	{ super(url);
30
31	HTMLDoc htmlDoc;
32	if (url.toString().startsWith("file://"))
33	{ htmlDoc = new HTMLDoc(url, url.toString().substring(7));
34	}
35	else if (url.toString().startsWith("file:/"))
36	{ htmlDoc = new HTMLDoc(url, url.toString().substring(5));
37	}
38	else
39	{ htmlDoc = new HTMLDoc(url);
40	}
41
42	this._extractDocumentFiles(htmlDoc);
43	this._extractDocumentMetadata(htmlDoc);
44	}
45
46	private void _extractDocumentMetadata(HTMLDoc htmlDoc)
47	{ HTMLBlock codedContent = htmlDoc.getCodedContent();
48	boolean inTitle = false;
49	StringBuffer title = new StringBuffer();
50
51	for (int e = 0; e < codedContent.size(); e ++)
52	{ if (codedContent.elementAt(e) instanceof HTMLTag)
53	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
54
55	if (tag.tagName().equals("meta"))
56	{ // check that the name of the metadata item exists
57	String name = tag.idValue("name");
58	if (name == null \|\| name.length() == 0) {
59	continue;
60	}
61
62	// get the value, if it exists
63	String value = tag.idValue("content");
64	if (value != null && value.length() > 0) {
65	System.out.println(" " + value);
66	}
67	// if value does not exist, default it to being the same
68	// as the name.
69	else {
70	value = name;
71	}
72
73	this.addDocumentMetadata(name, value);
74	}
75	else if (tag.tagName().equals("title"))
76	{ inTitle = true;
77	}
78	else if (tag.tagName().equals("/title"))
79	{ inTitle = false;
80	}
81	// cut off when real body content appears - not a perfect
82	// implementation, just cheap & cheerful
83	else if (tag.tagName().equals("/head"))
84	{ break;
85	}
86	else if (tag.tagName().equals("body"))
87	{ break;
88	}
89	}
90	else if (inTitle == true)
91	{ title.append(codedContent.elementAt(e).toString());
92	}
93	}
94	if (title.length() > 0)
95	{ this.addDocumentMetadata("title", title.toString());
96	}
97	}
98
99	private void _extractDocumentFiles(HTMLDoc htmlDoc)
100	{ URL homeUrl = this.fileSet.getFile(0).getLocation();
101
102	HTMLBlock codedContent = htmlDoc.getCodedContent();
103	for (int e = 0; e < codedContent.size(); e ++)
104	{ if (codedContent.elementAt(e) instanceof HTMLTag)
105	{ HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
106
107	if (tag.tagName().equals("img"))
108	{ String location = tag.idValue("src");
109
110	try
111	{ // make the url for the image, and then add it to the document list of
112	//
113	URL imgUrl = new URL(homeUrl, location);
114	METSFile file = this.fileSet.addFile(imgUrl);
115	this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
116	}
117	catch (MalformedURLException ex)
118	{ // TODO: report exception/failure to resolve...
119	}
120	}
121	}
122	}
123	}
124
125	public String getDocumentType()
126	{ return HTML_DOCUMENT_TYPE;
127	}
128
129	public String getDocumentText()
130	{
131	HTMLDoc htmlDoc;
132	URL url =(URL) this.fileSet.getFile(0).getLocation();
133
134	if (url.toString().startsWith("file://"))
135	{ htmlDoc = new HTMLDoc(url, url.toString().substring(7));
136	}
137	else if (url.toString().startsWith("file:/"))
138	{ htmlDoc = new HTMLDoc(url, url.toString().substring(5));
139	}
140	else
141	{ htmlDoc = new HTMLDoc(url);
142	}
143	return htmlDoc.getContent();
144	}
145
146	public String getSectionText(String document)
147	{ return "";
148	}
149	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: