Changeset 6284
- Timestamp: 2003-12-17T13:08:02+13:00 (20 years ago)
- Location: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes
- Files: 1 added, 3 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java
r6101 r6284 13 13 14 14 import org.greenstone.gsdl3.gs3build.metadata.NamespaceFactory; 15 import org.greenstone.gsdl3.gs3build.metadata.StructureIdentifierFactory; 15 16 import org.greenstone.gsdl3.gs3build.metadata.GSDL3Namespace; 16 17 import org.greenstone.gsdl3.gs3build.metadata.METSDescriptiveSet; … … 22 23 import org.greenstone.gsdl3.gs3build.metadata.METSDivision; 23 24 import org.greenstone.gsdl3.gs3build.metadata.METSNamespace; 25 import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel; 24 26 25 27 import org.greenstone.gsdl3.gs3build.util.MultiMap; … … 39 41 DocumentID id; 40 42 boolean isModified; 43 StructureIdentifierFactory structureIdFactory; 41 44 42 45 /** … … 53 56 this.structureSet = new METSStructureSet(); 54 57 this.id = id; 58 this.structureIdFactory = new StructureIdentifierFactory(); 55 59 } 56 60 … … 152 156 * @see DocumentInterface:addDocumentMetadata 153 157 */ 154 public void addDocumentMetadata(String name, String value) 155 { int colonAt = name.indexOf(":"); 156 String namespace; 157 158 if (colonAt > 0) { 159 namespace = name.substring(0, colonAt); 160 name = name.substring(colonAt+1); 161 } 162 else { 163 namespace = GSDL3Namespace.GSDL3_NAMESPACE_ID; 164 } 165 166 // no need to set isModified, as the following call will do it anyway! 167 this.addDocumentMetadata(namespace, name, value); 158 public void addDocumentMetadata(MetadataLabel label, String value) 159 { // no need to set isModified, as the following call will do it anyway! 160 this.addDocumentMetadata(label.getNamespace(), label.getLabel(), value); 168 161 } 169 162 … … 286 279 { this.fileSet = fileSet; 287 280 } 281 282 /** 283 * This is just a dummy function that does nothing at this level... 284 */ 285 public org.w3c.dom.Document getDOMDocument() 286 { return null; 287 } 288 288 289 289 /** -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentInterface.java
r6101 r6284 57 57 */ 58 58 public String getDocumentText(); 59 60 /** 61 * The document as a dom object 62 */ 63 public org.w3c.dom.Document getDOMDocument(); 59 64 60 65 /** … … 113 118 * the label. 114 119 * 115 * @param <code> String</code> label of the metadata, with a '.' to deliminate120 * @param <code>MetadataLabel</code> label of the metadata, with a '.' to deliminate 116 121 * sub-component structures. The label may commence 117 122 * with a namespace followed by a colon. … … 119 124 * @param <code>String</code> value of the metadata 120 125 */ 121 public void addDocumentMetadata( Stringlabel, String value);126 public void addDocumentMetadata(MetadataLabel label, String value); 122 127 123 128 /** … … 132 137 public void addDocumentMetadata(String namespace, String label, String value); 133 138 134 135 139 /** 136 140 * Post metadata to a file in this document - the appropriate changes -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java
r6101 r6284 9 9 import java.util.ArrayList; 10 10 11 import org.greenstone.gsdl3.gs3build.metadata. METSFile;11 import org.greenstone.gsdl3.gs3build.metadata.*; 12 12 import org.greenstone.gsdl3.gs3build.util.*; 13 import org.greenstone.gsdl3.gs3build.xpointer.XPointer; 14 15 import org.w3c.dom.*; 13 16 14 17 public class HTMLDocument extends AbstractDocument 15 18 { 16 19 public static final String HTML_DOCUMENT_TYPE = "HTML"; 20 Document domDocument; 17 21 18 22 public HTMLDocument(DocumentID id) 19 23 { super(id); 24 this.domDocument = null; 20 25 } 21 26 … … 29 34 { super(url); 30 35 31 HTMLDoc htmlDoc; 36 this.loadDocument(url); 37 38 this._extractDocumentFiles(); 39 this._extractDocumentMetadata(); 40 41 HTMLDocumentTools docTools = new HTMLDocumentTools(this.domDocument); 42 docTools.setMetsDocument(this); 43 docTools.setUrl(this.fileSet.getFile(0).getLocation()); 44 docTools.findSections(); 45 METSStructure sectionStruct = docTools.getStructure(); 46 47 if (sectionStruct.size() > 0) { 48 METSStructureSet structureSet = this.getDocumentStructure(); 49 structureSet.addStructure(sectionStruct); 50 } 51 } 52 53 private void loadDocument(URL url) 54 { 55 // HTMLDoc htmlDoc; 56 HTMLTidy tidyDoc; 32 57 if (url.toString().startsWith("file://")) 33 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 58 { //htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 59 tidyDoc = new HTMLTidy(new File(url.toString().substring(7))); 34 60 } 35 61 else if (url.toString().startsWith("file:/")) 36 { htmlDoc = new HTMLDoc(url, url.toString().substring(5)); 62 { //htmlDoc = new HTMLDoc(url, url.toString().substring(5)); 63 tidyDoc = new HTMLTidy(new File( url.toString().substring(5))); 37 64 } 38 65 else 39 { htmlDoc = new HTMLDoc(url); 66 { //htmlDoc = new HTMLDoc(url); 67 tidyDoc = new HTMLTidy(url); 40 68 } 41 69 42 this._extractDocumentFiles(htmlDoc); 43 this._extractDocumentMetadata(htmlDoc); 44 } 45 46 private void _extractDocumentMetadata(HTMLDoc htmlDoc) 47 { 
HTMLBlock codedContent = htmlDoc.getCodedContent(); 70 this.domDocument = tidyDoc.getDocument(); 71 } 72 73 private void _extractDocumentMetadata() 74 { 75 NodeList metadata = this.domDocument.getElementsByTagName("META"); 76 for (int n = 0; n < metadata.getLength(); n ++) { 77 Node node = metadata.item(n); 78 Element element = (Element) node; 79 80 String name = element.getAttribute("name"); 81 if (name == null || name.length() == 0) { 82 continue; 83 } 84 85 String value = element.getAttribute("content"); 86 if (value == null || value.length() == 0) { 87 value = name; 88 } 89 90 this.addDocumentMetadata(new MetadataLabel(name), value); 91 } 92 93 NodeList titles = this.domDocument.getElementsByTagName("TITLE"); 94 StringBuffer title = new StringBuffer(); 95 for (int n = 0; n < titles.getLength(); n ++) { 96 Node node = titles.item(n); 97 Element element = (Element) node; 98 99 NodeList childNodes = node.getChildNodes(); 100 for (int c = 0; c < childNodes.getLength(); c ++) { 101 Node child = childNodes.item(c); 102 if (child.getNodeType() == org.w3c.dom.Node.TEXT_NODE) { 103 title.append(child.getNodeValue()); 104 } 105 } 106 } 107 108 if (title.length() > 0) 109 { this.addDocumentMetadata(new MetadataLabel("title"), title.toString()); 110 } 111 112 /* Old HTMLDoc based parsing... 113 HTMLBlock codedContent = htmlDoc.getCodedContent(); 48 114 boolean inTitle = false; 49 StringBuffertitle = new StringBuffer();115 title = new StringBuffer(); 50 116 51 117 for (int e = 0; e < codedContent.size(); e ++) … … 62 128 // get the value, if it exists 63 129 String value = tag.idValue("content"); 64 if (value != null && value.length() > 0) { 65 System.out.println(" " + value); 66 } 130 67 131 // if value does not exist, default it to being the same 68 132 // as the name. 
69 else{133 if (value == null || value.length() > 0) { 70 134 value = name; 71 135 } 72 73 this.addDocumentMetadata(name, value); 74 } 75 else if (tag.tagName().equals("title")) 136 } 137 else 138 if (tag.tagName().equals("title")) 76 139 { inTitle = true; 77 140 } … … 92 155 } 93 156 } 94 if (title.length() > 0) 95 { this.addDocumentMetadata("title", title.toString()); 96 } 97 } 98 99 private void _extractDocumentFiles(HTMLDoc htmlDoc) 157 */ 158 } 159 160 private void _extractDocumentFiles() 100 161 { URL homeUrl = this.fileSet.getFile(0).getLocation(); 101 162 163 NodeList metadata = this.domDocument.getElementsByTagName("img"); 164 for (int n = 0; n < metadata.getLength(); n ++) { 165 Node node = metadata.item(n); 166 Element element = (Element) node; 167 168 String location = element.getAttribute("src"); 169 if (location == null || location.length() == 0) { 170 System.out.println("No name"); 171 continue; 172 } 173 174 try 175 { // make the url for the image, and then add it to the document list of 176 // 177 URL imgUrl = new URL(homeUrl, location); 178 METSFile file = this.fileSet.addFile(imgUrl); 179 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID()); 180 } 181 catch (MalformedURLException ex) 182 { // TODO: report exception/failure to resolve... 
183 } 184 } 185 186 /** 102 187 HTMLBlock codedContent = htmlDoc.getCodedContent(); 103 188 for (int e = 0; e < codedContent.size(); e ++) … … 121 206 } 122 207 } 208 */ 123 209 } 124 210 … … 132 218 URL url =(URL) this.fileSet.getFile(0).getLocation(); 133 219 220 this.getSectionText("1"); 221 134 222 if (url.toString().startsWith("file://")) 135 223 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); … … 144 232 } 145 233 146 public String getSectionText(String document) 147 { return ""; 234 public Document getDOMDocument() 235 { 236 if (this.domDocument == null) { 237 URL url =(URL) this.fileSet.getFile(0).getLocation(); 238 this.loadDocument(url); 239 } 240 return this.domDocument; 241 } 242 243 private XPointer getXPointer(METSDivision division) 244 { String fileId = division.getDefaultFileReference(); 245 246 if (fileId == null) { 247 System.err.println("Unable to obtain file reference for section " + division.getLabel()); 248 return null; 249 } 250 251 METSFileGroup fileGroup = this.getDocumentFiles().getGroup(fileId); 252 if (fileGroup == null) { 253 System.err.println("Unable to obtain file reference for filegroup " + fileId); 254 return null; 255 } 256 257 METSFile file = fileGroup.getFile(0); 258 if (file == null) { 259 System.err.println("Unable to obtain any files within filegroup " + fileId); 260 return null; 261 } 262 263 URL url = file.getLocation(); 264 XPointer xpointer = XPointer.processXPointer(this.domDocument, url); 265 266 return xpointer; 267 } 268 269 private XPointer getXPointer(String sectionId) 270 { if (this.domDocument == null) { 271 URL url =(URL) this.fileSet.getFile(0).getLocation(); 272 this.loadDocument(url); 273 } 274 275 METSDivision division = 276 this.getDocumentStructure().getDivision(METSStructureSet.GSDL3_SECTION_STRUCTURE, 277 sectionId); 278 if (division == null) { 279 // System.err.println("Unable to locate section " + sectionId); 280 return null; 281 } 282 283 return this.getXPointer(division); 284 } 285 286 
public Node getSectionStartNode(METSDivision division) 287 { XPointer xpointer = this.getXPointer(division); 288 return xpointer.getStartNode(); 289 } 290 291 public Node getSectionStartNode(String sectionId) 292 { XPointer xpointer = this.getXPointer(sectionId); 293 294 return xpointer.getStartNode(); 295 } 296 297 public String getSectionText(String sectionId) 298 { XPointer xpointer = this.getXPointer(sectionId); 299 300 if (xpointer == null) { 301 return ""; 302 } 303 304 // get the XML content of the xpointer... 305 return xpointer.toString(); 148 306 } 149 307 }
Note: See TracChangeset for help on using the changeset viewer.