source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocumentTools.java@ 6284

Last change on this file since 6284 was 6284, checked in by cs025, 20 years ago

Added HTMLDocumentTools, also modifications to the abstract interfaces
and the HTMLDocument doctype to support indexing by section.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.0 KB
Line 
1package org.greenstone.gsdl3.gs3build.doctypes;
2
3import org.w3c.dom.*;
4
5import java.net.URL;
6
7import java.util.List;
8import java.util.ArrayList;
9
10import org.greenstone.gsdl3.gs3build.metadata.METSStructureSet;
11import org.greenstone.gsdl3.gs3build.metadata.AbstractStructure;
12import org.greenstone.gsdl3.gs3build.metadata.METSStructure;
13import org.greenstone.gsdl3.gs3build.metadata.METSDivision;
14import org.greenstone.gsdl3.gs3build.metadata.METSDescriptive;
15import org.greenstone.gsdl3.gs3build.metadata.METSFileSet;
16import org.greenstone.gsdl3.gs3build.metadata.METSFileGroup;
17import org.greenstone.gsdl3.gs3build.metadata.METSFile;
18import org.greenstone.gsdl3.gs3build.metadata.METSFilePos;
19import org.greenstone.gsdl3.gs3build.metadata.METSFileID;
20import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
21import org.greenstone.gsdl3.gs3build.doctypes.AbstractDocument;
22import org.greenstone.gsdl3.gs3build.util.*;
23
24public class HTMLDocumentTools
25{
26 Document document;
27 URL documentUrl;
28 int sectionLevel;
29 StringBuffer sectionLabel;
30 List sectionSeq;
31 HTMLSection rootSection;
32 HTMLSection currentSection;
33 int sectionSeqNo;
34 METSStructure metsStructure;
35 AbstractStructure currentMets;
36 AbstractDocument metsDocument;
37
/**
 * One node in the tree of sections discovered in an HTML document.
 *
 * A section records a label, a sequence number, the locations at which it
 * starts and ends, its parent section and its child sections. The start and
 * end locations are opaque strings supplied by the caller; they are only
 * ever concatenated into the XPointer returned by {@link #getXPointer()}.
 */
class HTMLSection
{
    String startLocation;
    String endLocation;
    String label;
    List children;
    HTMLSection parent;
    int seqNo;

    /**
     * Creates a parentless section with no children and no locations set.
     *
     * @param label label of the section
     * @param seqNo sequence number of the section
     */
    public HTMLSection(String label, int seqNo)
    {
        this.children = new ArrayList();
        this.parent = null;
        this.seqNo = seqNo;
        this.label = label;
    }

    /** Records where this section begins. */
    public void setStartLocation(String location)
    {
        startLocation = location;
    }

    /** Records where this section ends. */
    public void setEndLocation(String location)
    {
        endLocation = location;
    }

    /**
     * Appends <code>child</code> to this section's children and makes this
     * section the child's parent.
     */
    public void addChild(HTMLSection child)
    {
        children.add(child);
        child.parent = this;
    }

    /** @return the parent section, or null if none has been assigned */
    public HTMLSection getParent()
    {
        return parent;
    }

    /** @return the label given at construction time */
    public String getLabel()
    {
        return label;
    }

    /**
     * Builds the XPointer expression spanning this section, of the form
     * <code>xpointer(START/range-to(END))</code>. A location that has not
     * been set appears as the text "null".
     */
    public String getXPointer()
    {
        return "xpointer(" + startLocation + "/range-to(" + endLocation + "))";
    }
}
89
90 public HTMLDocumentTools(Document document)
91 { this.document = document;
92 this.sectionLevel = 0;
93 this.sectionSeq = new ArrayList();
94 this.sectionSeq.add(new Integer(0));
95 this.sectionLabel = new StringBuffer();
96 this.rootSection = new HTMLSection("", 0);
97 this.currentSection = this.rootSection;
98 this.metsStructure = new METSStructure(METSStructureSet.GSDL3_SECTION_STRUCTURE,
99 "Section", "Section");
100 this.currentMets = this.metsStructure;
101 this.metsDocument = null;
102 }
103
104 public void setMetsDocument(AbstractDocument document)
105 { this.metsDocument = document;
106 }
107
108 public void setUrl(URL url)
109 { this.documentUrl = url;
110 }
111
112 public Object findSections()
113 { this.findSections(document.getDocumentElement(), "/1");
114 return this.rootSection;
115 }
116
117 public void findSections(Node node, String location)
118 { // if at an element, then we aren't going to find section information,
119 // but we do need to recurse into the children to find the section markers
120 // there
121 if (node.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
122 NodeList children = node.getChildNodes();
123 StringBuffer locationBuffer = new StringBuffer();
124 locationBuffer.append(location);
125 locationBuffer.append("/");
126
127 int elementNo = 1;
128 int commentNo = 1;
129 for (int c = 0; c < children.getLength(); c++)
130 {
131 locationBuffer.setLength(location.length()+1);
132 if (children.item(c).getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
133 locationBuffer.append(Integer.toString(elementNo));
134 elementNo ++;
135 }
136 else if (children.item(c).getNodeType() == org.w3c.dom.Node.COMMENT_NODE) {
137 locationBuffer.append("comment()[");
138 locationBuffer.append(Integer.toString(commentNo));
139 locationBuffer.append("]");
140 commentNo ++;
141 }
142 this.findSections(children.item(c), locationBuffer.toString());
143 }
144 }
145 // if at a comment node, then one needs to dissect it to discover if
146 // any section markings are included...
147 else if (node.getNodeType() == org.w3c.dom.Node.COMMENT_NODE) {
148 String commentText = node.getNodeValue();
149
150 // parse the node now...checking first for any 'section' items in
151 // it - this is a crude check...
152 if (commentText.indexOf("<Section>") >= 0 ||
153 commentText.indexOf("</Section>") >= 0) {
154 // parse the section roughly using the primitive HTMLDoc form...
155 HTMLDoc innerDoc = HTMLDoc.fromString("dummy", commentText);
156 HTMLBlock innerHtml = innerDoc.getCodedContent();
157 boolean inMetadata = false;
158 StringBuffer metaValue = new StringBuffer();
159 String metaLabel = null;
160
161 // Now just go through the content in sequence...
162 for (int e = 0; e < innerHtml.size(); e ++) {
163 Object element = innerHtml.elementAt(e);
164
165 if (element instanceof HTMLTag) {
166 HTMLTag tag = (HTMLTag) element;
167
168 if (tag.tagName().equals("section")) {
169 // get the counters sorted out for the sections...
170 this.sectionSeq.set(this.sectionLevel, new Integer(((Integer) this.sectionSeq.get(this.sectionLevel)).intValue() + 1));
171 this.sectionLevel ++;
172 if (sectionSeq.size() == this.sectionLevel) {
173 this.sectionSeq.add(new Integer(0));
174 }
175 else {
176 this.sectionSeq.set(this.sectionLevel, new Integer(0));
177 }
178
179 // create the label for the new section...
180 this.sectionLabel.setLength(0);
181 for (int s = 0; s < sectionLevel; s ++) {
182 if (s > 0) {
183 this.sectionLabel.append(".");
184 }
185 this.sectionLabel.append(this.sectionSeq.get(s).toString());
186 }
187
188 // note the new section in a new data structure...
189 HTMLSection newSection = new HTMLSection(this.sectionLabel.toString(), this.sectionSeqNo);
190
191 newSection.setStartLocation(location);
192 this.currentSection.addChild(newSection);
193 this.currentSection = newSection;
194
195 // do the same for mets...
196 METSDivision newDivision = new METSDivision("DS" + Integer.toString(this.sectionSeqNo), Integer.toString(this.sectionSeqNo),
197 this.sectionLabel.toString(), this.sectionLabel.toString(), "Section");
198 this.currentMets.addDivision(newDivision);
199 this.currentMets = newDivision;
200
201 this.sectionSeqNo ++;
202 }
203 else if (tag.tagName().equals("/section")) {
204 // cope with poorly formed metadata...
205 if (inMetadata) {
206 this.postMetadata(metaLabel, metaValue.toString());
207 }
208
209 // note the end position of the section
210 this.currentSection.setEndLocation(location);
211
212 // now post that section to the file list...
213 this.postFilePtr(this.currentSection);
214
215 // backtrack up the section tree...
216 this.currentSection = this.currentSection.getParent();
217 this.currentMets = this.currentMets.getParent();
218 this.sectionLevel --;
219
220 // trim the section label
221 int cutTo = this.sectionLabel.toString().lastIndexOf('.');
222 if (cutTo >= 0) {
223 this.sectionLabel.setLength(cutTo);
224 }
225 else {
226 this.sectionLabel.setLength(0);
227 }
228
229 // we can't be in metadata either...
230 inMetadata = false;
231 }
232 else if (tag.tagName().equals("metadata")) {
233 if (inMetadata) {
234 // post the metadata
235 this.postMetadata(metaLabel, metaValue.toString());
236 }
237 metaValue.setLength(0);
238 metaLabel = tag.idValue("name");
239 inMetadata = (metaLabel != null);
240 }
241 else if (tag.tagName().equals("/metadata")) {
242 // post the metadata
243 this.postMetadata(metaLabel, metaValue.toString());
244 inMetadata = false;
245 }
246 }
247 else {
248 if (inMetadata) {
249 metaValue.append(element.toString());
250 }
251 }
252 }
253 }
254 }
255 }
256
257 private String makeXPointer(HTMLSection section)
258 { StringBuffer xPointerBuffer = new StringBuffer();
259
260 if (this.documentUrl != null)
261 { xPointerBuffer.append(this.documentUrl.toString());
262 xPointerBuffer.append("#");
263 }
264 xPointerBuffer.append(section.getXPointer());
265
266 return xPointerBuffer.toString();
267 }
268
269 private void postFilePtr(HTMLSection section)
270 { String xPointer = this.makeXPointer(section);
271
272 if (this.metsDocument == null) {
273 return;
274 }
275
276 try {
277 URL url = new URL(xPointer);
278
279 METSFileSet fileSet = this.metsDocument.getDocumentFiles();
280
281 METSFilePos filePos = new METSFilePos(url);
282 METSFile file = fileSet.createFile(filePos, "text/html");
283
284 METSFileGroup fileGroup;
285 String fileRef = ((METSDivision) this.currentMets).getDefaultFileReference();
286
287 if (fileRef == null) {
288 fileGroup = fileSet.createGroup();
289 fileSet.addGroup(fileGroup);
290 ((METSDivision) this.currentMets).addFileReference(fileGroup.getId());
291 }
292 else {
293 // simple call - the group should be at the 'root' level, immediately below
294 // the FileSet itself...
295 fileGroup = fileSet.getGroup(fileRef);
296 }
297 fileGroup.addFile(file);
298
299
300 }
301 catch (java.net.MalformedURLException ex) {
302 System.err.println(ex + " " + xPointer);
303 }
304 }
305
306 private void postMetadata(String label, String value)
307 {
308 if (this.metsDocument == null) {
309 return;
310 }
311
312 String metadataRef = ((METSDivision) this.currentMets).getDefaultMetadataReference();
313 METSDescriptive descriptive;
314 if (metadataRef == null) {
315 descriptive = this.metsDocument.getDocumentMetadata().createDescriptive(this.currentSection.getLabel());
316 ((METSDivision) this.currentMets).addMetadataReference(descriptive.getID());
317 }
318 else {
319 descriptive = this.metsDocument.getDocumentMetadata().getDescriptiveById(metadataRef);
320 }
321 descriptive.addMetadata(MetadataLabel.getDefaultNamespace(), label, value);
322 }
323
324 public METSStructure getStructure()
325 { return this.metsStructure;
326 }
327}
328
Note: See TracBrowser for help on using the repository browser.