source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java@ 6101

Last change on this file since 6101 was 6101, checked in by cs025, 20 years ago

Added getSectionText member function to documents

  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1package org.greenstone.gsdl3.gs3build.doctypes;
2
3import java.io.File;
4import java.net.URL;
5import java.net.MalformedURLException;
6
7import java.util.List;
8import java.util.Map;
9import java.util.ArrayList;
10
11import org.greenstone.gsdl3.gs3build.metadata.METSFile;
12import org.greenstone.gsdl3.gs3build.util.*;
13
14public class HTMLDocument extends AbstractDocument
15{
16 public static final String HTML_DOCUMENT_TYPE = "HTML";
17
18 public HTMLDocument(DocumentID id)
19 { super(id);
20 }
21
22 /**
23 * Create the HTMLDocument from a given URL - the URL may in fact be a reference
24 * to a local file.
25 *
26 * @param <code>URL</code> The location from which to load the file
27 */
28 public HTMLDocument(URL url)
29 { super(url);
30
31 HTMLDoc htmlDoc;
32 if (url.toString().startsWith("file://"))
33 { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
34 }
35 else if (url.toString().startsWith("file:/"))
36 { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
37 }
38 else
39 { htmlDoc = new HTMLDoc(url);
40 }
41
42 this._extractDocumentFiles(htmlDoc);
43 this._extractDocumentMetadata(htmlDoc);
44 }
45
46 private void _extractDocumentMetadata(HTMLDoc htmlDoc)
47 { HTMLBlock codedContent = htmlDoc.getCodedContent();
48 boolean inTitle = false;
49 StringBuffer title = new StringBuffer();
50
51 for (int e = 0; e < codedContent.size(); e ++)
52 { if (codedContent.elementAt(e) instanceof HTMLTag)
53 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
54
55 if (tag.tagName().equals("meta"))
56 { // check that the name of the metadata item exists
57 String name = tag.idValue("name");
58 if (name == null || name.length() == 0) {
59 continue;
60 }
61
62 // get the value, if it exists
63 String value = tag.idValue("content");
64 if (value != null && value.length() > 0) {
65 System.out.println(" " + value);
66 }
67 // if value does not exist, default it to being the same
68 // as the name.
69 else {
70 value = name;
71 }
72
73 this.addDocumentMetadata(name, value);
74 }
75 else if (tag.tagName().equals("title"))
76 { inTitle = true;
77 }
78 else if (tag.tagName().equals("/title"))
79 { inTitle = false;
80 }
81 // cut off when real body content appears - not a perfect
82 // implementation, just cheap & cheerful
83 else if (tag.tagName().equals("/head"))
84 { break;
85 }
86 else if (tag.tagName().equals("body"))
87 { break;
88 }
89 }
90 else if (inTitle == true)
91 { title.append(codedContent.elementAt(e).toString());
92 }
93 }
94 if (title.length() > 0)
95 { this.addDocumentMetadata("title", title.toString());
96 }
97 }
98
99 private void _extractDocumentFiles(HTMLDoc htmlDoc)
100 { URL homeUrl = this.fileSet.getFile(0).getLocation();
101
102 HTMLBlock codedContent = htmlDoc.getCodedContent();
103 for (int e = 0; e < codedContent.size(); e ++)
104 { if (codedContent.elementAt(e) instanceof HTMLTag)
105 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
106
107 if (tag.tagName().equals("img"))
108 { String location = tag.idValue("src");
109
110 try
111 { // make the url for the image, and then add it to the document list of
112 //
113 URL imgUrl = new URL(homeUrl, location);
114 METSFile file = this.fileSet.addFile(imgUrl);
115 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
116 }
117 catch (MalformedURLException ex)
118 { // TODO: report exception/failure to resolve...
119 }
120 }
121 }
122 }
123 }
124
125 public String getDocumentType()
126 { return HTML_DOCUMENT_TYPE;
127 }
128
129 public String getDocumentText()
130 {
131 HTMLDoc htmlDoc;
132 URL url =(URL) this.fileSet.getFile(0).getLocation();
133
134 if (url.toString().startsWith("file://"))
135 { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
136 }
137 else if (url.toString().startsWith("file:/"))
138 { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
139 }
140 else
141 { htmlDoc = new HTMLDoc(url);
142 }
143 return htmlDoc.getContent();
144 }
145
146 public String getSectionText(String document)
147 { return "";
148 }
149}
Note: See TracBrowser for help on using the repository browser.