- Timestamp: 2003-11-24T14:26:35+13:00 (20 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java
r5800 r5944 14 14 public class HTMLDocument extends AbstractDocument 15 15 { 16 16 public static final String HTML_DOCUMENT_TYPE = "HTML"; 17 17 18 /** 19 * Create the HTMLDocument from a given URL - the URL may in fact be a reference 20 * to a local file. 21 * 22 * @param <code>URL</code> The location from which to load the file 23 */ 24 public HTMLDocument(URL url) 25 { super(url); 18 public HTMLDocument(DocumentID id) 19 { super(id); 20 } 26 21 27 HTMLDoc htmlDoc; 28 if (url.toString().startsWith("file://")) 29 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 30 } 31 else 32 { htmlDoc = new HTMLDoc(url); 33 } 22 /** 23 * Create the HTMLDocument from a given URL - the URL may in fact be a reference 24 * to a local file. 25 * 26 * @param <code>URL</code> The location from which to load the file 27 */ 28 public HTMLDocument(URL url) 29 { super(url); 34 30 35 this._extractDocumentFiles(htmlDoc); 36 this._extractDocumentMetadata(htmlDoc); 31 HTMLDoc htmlDoc; 32 if (url.toString().startsWith("file://")) 33 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 34 } 35 else if (url.toString().startsWith("file:/")) 36 { htmlDoc = new HTMLDoc(url, url.toString().substring(5)); 37 } 38 else 39 { htmlDoc = new HTMLDoc(url); 40 } 41 42 this._extractDocumentFiles(htmlDoc); 43 this._extractDocumentMetadata(htmlDoc); 44 } 45 46 private void _extractDocumentMetadata(HTMLDoc htmlDoc) 47 { HTMLBlock codedContent = htmlDoc.getCodedContent(); 48 boolean inTitle = false; 49 StringBuffer title = new StringBuffer(); 50 51 for (int e = 0; e < codedContent.size(); e ++) 52 { if (codedContent.elementAt(e) instanceof HTMLTag) 53 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 54 55 if (tag.tagName().equals("meta")) 56 { // check that the name of the metadata item exists 57 String name = tag.idValue("name"); 58 if (name == null || name.length() == 0) { 59 continue; 60 } 61 62 // get the value, if it exists 63 String value = tag.idValue("content"); 64 if (value != null && 
value.length() > 0) { 65 System.out.println(" " + value); 66 } 67 // if value does not exist, default it to being the same 68 // as the name. 69 else { 70 value = name; 71 } 72 73 this.addDocumentMetadata(name, value); 37 74 } 75 else if (tag.tagName().equals("title")) 76 { inTitle = true; 77 } 78 else if (tag.tagName().equals("/title")) 79 { inTitle = false; 80 } 81 // cut off when real body content appears - not a perfect 82 // implementation, just cheap & cheerful 83 else if (tag.tagName().equals("/head")) 84 { break; 85 } 86 else if (tag.tagName().equals("body")) 87 { break; 88 } 89 } 90 else if (inTitle == true) 91 { title.append(codedContent.elementAt(e).toString()); 92 } 93 } 94 if (title.length() > 0) 95 { this.addDocumentMetadata("title", title.toString()); 96 } 97 } 38 98 39 private void _extractDocumentMetadata(HTMLDoc htmlDoc) 40 { HTMLBlock codedContent = htmlDoc.getCodedContent(); 41 boolean inTitle = false; 42 StringBuffer title = new StringBuffer(); 99 private void _extractDocumentFiles(HTMLDoc htmlDoc) 100 { URL homeUrl = this.fileSet.getFile(0).getLocation(); 43 101 44 for (int e = 0; e < codedContent.size(); e ++) 45 { if (codedContent.elementAt(e) instanceof HTMLTag) 46 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 102 HTMLBlock codedContent = htmlDoc.getCodedContent(); 103 for (int e = 0; e < codedContent.size(); e ++) 104 { if (codedContent.elementAt(e) instanceof HTMLTag) 105 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 47 106 48 if (tag.tagName().equals("meta")) 49 { // check that the name of the metadata item exists 50 String name = tag.idValue("name"); 51 if (name == null || name.length() == 0) { 52 continue; 53 } 107 if (tag.tagName().equals("img")) 108 { String location = tag.idValue("src"); 54 109 55 // get the value, if it exists 56 String value = tag.idValue("content"); 57 if (value != null && value.length() > 0) { 58 System.out.println(" " + value); 59 } 60 // if value does not exist, default it to being the same 61 // 
as the name. 62 else { 63 value = name; 64 } 65 66 this.addDocumentMetadata(name, value); 67 } 68 else if (tag.tagName().equals("title")) 69 { inTitle = true; 70 } 71 else if (tag.tagName().equals("/title")) 72 { inTitle = false; 73 } 74 // cut off when real body content appears - not a perfect 75 // implementation, just cheap & cheerful 76 else if (tag.tagName().equals("/head")) 77 { break; 78 } 79 else if (tag.tagName().equals("body")) 80 { break; 81 } 82 } 83 else if (inTitle == true) 84 { title.append(codedContent.elementAt(e).toString()); 85 } 86 } 87 if (title.length() > 0) 88 { this.addDocumentMetadata("title", title.toString()); 89 } 90 } 91 92 private void _extractDocumentFiles(HTMLDoc htmlDoc) 93 { URL homeUrl = this.fileSet.getFile(0).getLocation(); 94 95 HTMLBlock codedContent = htmlDoc.getCodedContent(); 96 for (int e = 0; e < codedContent.size(); e ++) 97 { if (codedContent.elementAt(e) instanceof HTMLTag) 98 { HTMLTag tag = (HTMLTag) codedContent.elementAt(e); 99 100 if (tag.tagName().equals("img")) 101 { String location = tag.idValue("src"); 102 103 try 104 { // make the url for the image, and then add it to the document list of 105 // 106 URL imgUrl = new URL(homeUrl, location); 107 METSFile file = this.fileSet.addFile(imgUrl); 108 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID()); 109 } 110 catch (MalformedURLException ex) 111 { // TODO: report exception/failure to resolve... 112 } 110 try 111 { // make the url for the image, and then add it to the document list of 112 // 113 URL imgUrl = new URL(homeUrl, location); 114 METSFile file = this.fileSet.addFile(imgUrl); 115 this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID()); 116 } 117 catch (MalformedURLException ex) 118 { // TODO: report exception/failure to resolve... 
113 119 } 114 120 } 115 121 } 116 122 } 123 } 117 124 118 public String getDocumentType() 119 { return HTML_DOCUMENT_TYPE; 125 public String getDocumentType() 126 { return HTML_DOCUMENT_TYPE; 127 } 128 129 public String getDocumentText() 130 { 131 HTMLDoc htmlDoc; 132 URL url =(URL) this.fileSet.getFile(0).getLocation(); 133 134 if (url.toString().startsWith("file://")) 135 { htmlDoc = new HTMLDoc(url, url.toString().substring(7)); 120 136 } 121 122 public String getDocumentText() 123 { HTMLDoc htmlDoc = new HTMLDoc((URL) this.fileSet.getFile(0).getLocation(), this.fileSet.getFile(0).toString().substring(7)); 124 return htmlDoc.getContent(); 137 else if (url.toString().startsWith("file:/")) 138 { htmlDoc = new HTMLDoc(url, url.toString().substring(5)); 125 139 } 140 else 141 { htmlDoc = new HTMLDoc(url); 142 } 143 return htmlDoc.getContent(); 144 } 126 145 }
Note: See TracChangeset for help on using the changeset viewer.