1 | package org.greenstone.gsdl3.gs3build.doctypes;
|
---|
2 |
|
---|
3 | import org.w3c.dom.*;
|
---|
4 |
|
---|
5 | import java.net.URL;
|
---|
6 |
|
---|
7 | import java.util.List;
|
---|
8 | import java.util.ArrayList;
|
---|
9 |
|
---|
10 | import org.greenstone.gsdl3.gs3build.metadata.METSStructureSet;
|
---|
11 | import org.greenstone.gsdl3.gs3build.metadata.AbstractStructure;
|
---|
12 | import org.greenstone.gsdl3.gs3build.metadata.METSStructure;
|
---|
13 | import org.greenstone.gsdl3.gs3build.metadata.METSDivision;
|
---|
14 | import org.greenstone.gsdl3.gs3build.metadata.METSDescriptive;
|
---|
15 | import org.greenstone.gsdl3.gs3build.metadata.METSFileSet;
|
---|
16 | import org.greenstone.gsdl3.gs3build.metadata.METSFileGroup;
|
---|
17 | import org.greenstone.gsdl3.gs3build.metadata.METSFile;
|
---|
18 | import org.greenstone.gsdl3.gs3build.metadata.METSFilePos;
|
---|
19 | import org.greenstone.gsdl3.gs3build.metadata.METSFileID;
|
---|
20 | import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
|
---|
21 | import org.greenstone.gsdl3.gs3build.doctypes.AbstractDocument;
|
---|
22 | import org.greenstone.gsdl3.gs3build.util.*;
|
---|
23 |
|
---|
24 | public class HTMLDocumentTools
|
---|
25 | {
|
---|
26 | Document document;
|
---|
27 | URL documentUrl;
|
---|
28 | int sectionLevel;
|
---|
29 | StringBuffer sectionLabel;
|
---|
30 | List sectionSeq;
|
---|
31 | HTMLSection rootSection;
|
---|
32 | HTMLSection currentSection;
|
---|
33 | int sectionSeqNo;
|
---|
34 | METSStructure metsStructure;
|
---|
35 | AbstractStructure currentMets;
|
---|
36 | AbstractDocument metsDocument;
|
---|
37 |
|
---|
38 | /**
|
---|
39 | *
|
---|
40 | */
|
---|
41 | class HTMLSection
|
---|
42 | { String startLocation;
|
---|
43 | String endLocation;
|
---|
44 | String label;
|
---|
45 | List children;
|
---|
46 | HTMLSection parent;
|
---|
47 | int seqNo;
|
---|
48 |
|
---|
49 | public HTMLSection(String label, int seqNo)
|
---|
50 | { this.label = label;
|
---|
51 | this.seqNo = seqNo;
|
---|
52 | this.parent = null;
|
---|
53 | this.children = new ArrayList();
|
---|
54 | }
|
---|
55 |
|
---|
56 | public void setStartLocation(String location)
|
---|
57 | { this.startLocation = location;
|
---|
58 | }
|
---|
59 |
|
---|
60 | public void setEndLocation(String location)
|
---|
61 | { this.endLocation = location;
|
---|
62 | }
|
---|
63 |
|
---|
64 | public void addChild(HTMLSection child)
|
---|
65 | { this.children.add(child);
|
---|
66 | child.parent = this;
|
---|
67 | }
|
---|
68 |
|
---|
69 | public HTMLSection getParent()
|
---|
70 | { return this.parent;
|
---|
71 | }
|
---|
72 |
|
---|
73 | public String getLabel()
|
---|
74 | { return this.label;
|
---|
75 | }
|
---|
76 |
|
---|
77 | public String getXPointer()
|
---|
78 | { StringBuffer xPointerBuffer = new StringBuffer();
|
---|
79 |
|
---|
80 | xPointerBuffer.append("xpointer(");
|
---|
81 | xPointerBuffer.append(this.startLocation);
|
---|
82 | xPointerBuffer.append("/range-to(");
|
---|
83 | xPointerBuffer.append(this.endLocation);
|
---|
84 | xPointerBuffer.append("))");
|
---|
85 |
|
---|
86 | return xPointerBuffer.toString();
|
---|
87 | }
|
---|
88 | }
|
---|
89 |
|
---|
90 | public HTMLDocumentTools(Document document)
|
---|
91 | { this.document = document;
|
---|
92 | this.sectionLevel = 0;
|
---|
93 | this.sectionSeq = new ArrayList();
|
---|
94 | this.sectionSeq.add(new Integer(0));
|
---|
95 | this.sectionLabel = new StringBuffer();
|
---|
96 | this.rootSection = new HTMLSection("", 0);
|
---|
97 | this.currentSection = this.rootSection;
|
---|
98 | this.metsStructure = new METSStructure(METSStructureSet.GSDL3_SECTION_STRUCTURE,
|
---|
99 | "Section", "Section");
|
---|
100 | this.currentMets = this.metsStructure;
|
---|
101 | this.metsDocument = null;
|
---|
102 | }
|
---|
103 |
|
---|
104 | public void setMetsDocument(AbstractDocument document)
|
---|
105 | { this.metsDocument = document;
|
---|
106 | }
|
---|
107 |
|
---|
108 | public void setUrl(URL url)
|
---|
109 | { this.documentUrl = url;
|
---|
110 | }
|
---|
111 |
|
---|
112 | public Object findSections()
|
---|
113 | { this.findSections(document.getDocumentElement(), "/1");
|
---|
114 | return this.rootSection;
|
---|
115 | }
|
---|
116 |
|
---|
117 | public void findSections(Node node, String location)
|
---|
118 | { // if at an element, then we aren't going to find section information,
|
---|
119 | // but we do need to recurse into the children to find the section markers
|
---|
120 | // there
|
---|
121 | if (node.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
|
---|
122 | NodeList children = node.getChildNodes();
|
---|
123 | StringBuffer locationBuffer = new StringBuffer();
|
---|
124 | locationBuffer.append(location);
|
---|
125 | locationBuffer.append("/");
|
---|
126 |
|
---|
127 | int elementNo = 1;
|
---|
128 | int commentNo = 1;
|
---|
129 | for (int c = 0; c < children.getLength(); c++)
|
---|
130 | {
|
---|
131 | locationBuffer.setLength(location.length()+1);
|
---|
132 | if (children.item(c).getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
|
---|
133 | locationBuffer.append(Integer.toString(elementNo));
|
---|
134 | elementNo ++;
|
---|
135 | }
|
---|
136 | else if (children.item(c).getNodeType() == org.w3c.dom.Node.COMMENT_NODE) {
|
---|
137 | locationBuffer.append("comment()[");
|
---|
138 | locationBuffer.append(Integer.toString(commentNo));
|
---|
139 | locationBuffer.append("]");
|
---|
140 | commentNo ++;
|
---|
141 | }
|
---|
142 | this.findSections(children.item(c), locationBuffer.toString());
|
---|
143 | }
|
---|
144 | }
|
---|
145 | // if at a comment node, then one needs to dissect it to discover if
|
---|
146 | // any section markings are included...
|
---|
147 | else if (node.getNodeType() == org.w3c.dom.Node.COMMENT_NODE) {
|
---|
148 | String commentText = node.getNodeValue();
|
---|
149 |
|
---|
150 | // parse the node now...checking first for any 'section' items in
|
---|
151 | // it - this is a crude check...
|
---|
152 | if (commentText.indexOf("<Section>") >= 0 ||
|
---|
153 | commentText.indexOf("</Section>") >= 0) {
|
---|
154 | // parse the section roughly using the primitive HTMLDoc form...
|
---|
155 | HTMLDoc innerDoc = HTMLDoc.fromString("dummy", commentText);
|
---|
156 | HTMLBlock innerHtml = innerDoc.getCodedContent();
|
---|
157 | boolean inMetadata = false;
|
---|
158 | StringBuffer metaValue = new StringBuffer();
|
---|
159 | String metaLabel = null;
|
---|
160 |
|
---|
161 | // Now just go through the content in sequence...
|
---|
162 | for (int e = 0; e < innerHtml.size(); e ++) {
|
---|
163 | Object element = innerHtml.elementAt(e);
|
---|
164 |
|
---|
165 | if (element instanceof HTMLTag) {
|
---|
166 | HTMLTag tag = (HTMLTag) element;
|
---|
167 |
|
---|
168 | if (tag.tagName().equals("section")) {
|
---|
169 | // get the counters sorted out for the sections...
|
---|
170 | this.sectionSeq.set(this.sectionLevel, new Integer(((Integer) this.sectionSeq.get(this.sectionLevel)).intValue() + 1));
|
---|
171 | this.sectionLevel ++;
|
---|
172 | if (sectionSeq.size() == this.sectionLevel) {
|
---|
173 | this.sectionSeq.add(new Integer(0));
|
---|
174 | }
|
---|
175 | else {
|
---|
176 | this.sectionSeq.set(this.sectionLevel, new Integer(0));
|
---|
177 | }
|
---|
178 |
|
---|
179 | // create the label for the new section...
|
---|
180 | this.sectionLabel.setLength(0);
|
---|
181 | for (int s = 0; s < sectionLevel; s ++) {
|
---|
182 | if (s > 0) {
|
---|
183 | this.sectionLabel.append(".");
|
---|
184 | }
|
---|
185 | this.sectionLabel.append(this.sectionSeq.get(s).toString());
|
---|
186 | }
|
---|
187 |
|
---|
188 | // note the new section in a new data structure...
|
---|
189 | HTMLSection newSection = new HTMLSection(this.sectionLabel.toString(), this.sectionSeqNo);
|
---|
190 |
|
---|
191 | newSection.setStartLocation(location);
|
---|
192 | this.currentSection.addChild(newSection);
|
---|
193 | this.currentSection = newSection;
|
---|
194 |
|
---|
195 | // do the same for mets...
|
---|
196 | METSDivision newDivision = new METSDivision("DS" + Integer.toString(this.sectionSeqNo), Integer.toString(this.sectionSeqNo),
|
---|
197 | this.sectionLabel.toString(), this.sectionLabel.toString(), "Section");
|
---|
198 | this.currentMets.addDivision(newDivision);
|
---|
199 | this.currentMets = newDivision;
|
---|
200 |
|
---|
201 | this.sectionSeqNo ++;
|
---|
202 | }
|
---|
203 | else if (tag.tagName().equals("/section")) {
|
---|
204 | // cope with poorly formed metadata...
|
---|
205 | if (inMetadata) {
|
---|
206 | this.postMetadata(metaLabel, metaValue.toString());
|
---|
207 | }
|
---|
208 |
|
---|
209 | // note the end position of the section
|
---|
210 | this.currentSection.setEndLocation(location);
|
---|
211 |
|
---|
212 | // now post that section to the file list...
|
---|
213 | this.postFilePtr(this.currentSection);
|
---|
214 |
|
---|
215 | // backtrack up the section tree...
|
---|
216 | this.currentSection = this.currentSection.getParent();
|
---|
217 | this.currentMets = this.currentMets.getParent();
|
---|
218 | this.sectionLevel --;
|
---|
219 |
|
---|
220 | // trim the section label
|
---|
221 | int cutTo = this.sectionLabel.toString().lastIndexOf('.');
|
---|
222 | if (cutTo >= 0) {
|
---|
223 | this.sectionLabel.setLength(cutTo);
|
---|
224 | }
|
---|
225 | else {
|
---|
226 | this.sectionLabel.setLength(0);
|
---|
227 | }
|
---|
228 |
|
---|
229 | // we can't be in metadata either...
|
---|
230 | inMetadata = false;
|
---|
231 | }
|
---|
232 | else if (tag.tagName().equals("metadata")) {
|
---|
233 | if (inMetadata) {
|
---|
234 | // post the metadata
|
---|
235 | this.postMetadata(metaLabel, metaValue.toString());
|
---|
236 | }
|
---|
237 | metaValue.setLength(0);
|
---|
238 | metaLabel = tag.idValue("name");
|
---|
239 | inMetadata = (metaLabel != null);
|
---|
240 | }
|
---|
241 | else if (tag.tagName().equals("/metadata")) {
|
---|
242 | // post the metadata
|
---|
243 | this.postMetadata(metaLabel, metaValue.toString());
|
---|
244 | inMetadata = false;
|
---|
245 | }
|
---|
246 | }
|
---|
247 | else {
|
---|
248 | if (inMetadata) {
|
---|
249 | metaValue.append(element.toString());
|
---|
250 | }
|
---|
251 | }
|
---|
252 | }
|
---|
253 | }
|
---|
254 | }
|
---|
255 | }
|
---|
256 |
|
---|
257 | private String makeXPointer(HTMLSection section)
|
---|
258 | { StringBuffer xPointerBuffer = new StringBuffer();
|
---|
259 |
|
---|
260 | if (this.documentUrl != null)
|
---|
261 | { xPointerBuffer.append(this.documentUrl.toString());
|
---|
262 | xPointerBuffer.append("#");
|
---|
263 | }
|
---|
264 | xPointerBuffer.append(section.getXPointer());
|
---|
265 |
|
---|
266 | return xPointerBuffer.toString();
|
---|
267 | }
|
---|
268 |
|
---|
269 | private void postFilePtr(HTMLSection section)
|
---|
270 | { String xPointer = this.makeXPointer(section);
|
---|
271 |
|
---|
272 | if (this.metsDocument == null) {
|
---|
273 | return;
|
---|
274 | }
|
---|
275 |
|
---|
276 | try {
|
---|
277 | URL url = new URL(xPointer);
|
---|
278 |
|
---|
279 | METSFileSet fileSet = this.metsDocument.getDocumentFiles();
|
---|
280 |
|
---|
281 | METSFilePos filePos = new METSFilePos(url);
|
---|
282 | METSFile file = fileSet.createFile(filePos, "text/html");
|
---|
283 |
|
---|
284 | METSFileGroup fileGroup;
|
---|
285 | String fileRef = ((METSDivision) this.currentMets).getDefaultFileReference();
|
---|
286 |
|
---|
287 | if (fileRef == null) {
|
---|
288 | fileGroup = fileSet.createGroup();
|
---|
289 | fileSet.addGroup(fileGroup);
|
---|
290 | ((METSDivision) this.currentMets).addFileReference(fileGroup.getId());
|
---|
291 | }
|
---|
292 | else {
|
---|
293 | // simple call - the group should be at the 'root' level, immediately below
|
---|
294 | // the FileSet itself...
|
---|
295 | fileGroup = fileSet.getGroup(fileRef);
|
---|
296 | }
|
---|
297 | fileGroup.addFile(file);
|
---|
298 |
|
---|
299 |
|
---|
300 | }
|
---|
301 | catch (java.net.MalformedURLException ex) {
|
---|
302 | System.err.println(ex + " " + xPointer);
|
---|
303 | }
|
---|
304 | }
|
---|
305 |
|
---|
306 | private void postMetadata(String label, String value)
|
---|
307 | {
|
---|
308 | if (this.metsDocument == null) {
|
---|
309 | return;
|
---|
310 | }
|
---|
311 |
|
---|
312 | String metadataRef = ((METSDivision) this.currentMets).getDefaultMetadataReference();
|
---|
313 | METSDescriptive descriptive;
|
---|
314 | if (metadataRef == null) {
|
---|
315 | descriptive = this.metsDocument.getDocumentMetadata().createDescriptive(this.currentSection.getLabel());
|
---|
316 | ((METSDivision) this.currentMets).addMetadataReference(descriptive.getID());
|
---|
317 | }
|
---|
318 | else {
|
---|
319 | descriptive = this.metsDocument.getDocumentMetadata().getDescriptiveById(metadataRef);
|
---|
320 | }
|
---|
321 | descriptive.addMetadata(MetadataLabel.getDefaultNamespace(), label, value);
|
---|
322 | }
|
---|
323 |
|
---|
324 | public METSStructure getStructure()
|
---|
325 | { return this.metsStructure;
|
---|
326 | }
|
---|
327 | }
|
---|
328 |
|
---|