Changeset 8710
- Timestamp: 2004-11-30T16:18:58+13:00 (19 years ago)
- Location: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLParser.java
r5947 r8710 4 4 5 5 public class HTMLParser 6 { HTMLDoc document; 7 int pos; 8 int lastpos; 9 10 public HTMLParser(HTMLDoc document) 11 { this.document = document; 12 this.pos = 0; 13 } 14 15 public void startParse() 16 { this.pos = 0; 17 this.lastpos = -1; 18 // System.out.println("Starting "+document.urlString()); 19 } 20 21 public int atParse() 22 { return this.pos; 23 } 24 25 public int lastParse() 26 { return this.lastpos; 27 } 28 29 // do a full text/tag parse 30 public String fullParse() 31 { String reply = null; 32 int end; 33 int start; 34 35 this.lastpos = this.pos; 36 37 if (this.pos >= this.document.getContent().length()) 38 { return reply; 39 } 40 41 if (this.document.getContent().charAt(this.pos) == '<') 42 { 43 start = this.pos; 44 45 // if we're not at the end of the document, 46 // read the rest of the tag 47 if (this.pos == this.document.getContent().length() - 1) 48 { this.pos = this.document.getContent().length(); 49 return reply; 50 } 51 52 // if the tag is a comment 53 if (this.pos < this.document.getContent().length() - 3 && 54 this.document.getContent().substring(this.pos, this.pos+4).equals("<!--")) 55 { end = this.document.getContent().substring(this.pos).indexOf("-->") + 3 + this.pos; 56 reply = this.document.getContent().substring(this.pos, end); 57 this.pos = end; 58 } 59 else 60 { // read up to the end of the tag 61 end = this.pos + 1; 62 while (end < this.document.getContent().length() && 63 this.document.getContent().charAt(end) != '>') 64 { end ++; 65 } 66 67 // get the whole of the tag into 'reply', and 68 // set the current pos to immediately after the tag 69 if (end < this.document.getContent().length()) 70 { reply = this.document.getContent().substring(this.pos, end + 1); 71 this.pos = end + 1; 72 } 73 // patch the trailing > onto the tag string 6 { 7 HTMLDoc document; 8 int pos; 9 int lastpos; 10 11 public HTMLParser(HTMLDoc document) 12 { 13 this.document = document; 14 this.pos = 0; 15 } 16 17 public void startParse() 18 { 19 
this.pos = 0; 20 this.lastpos = -1; 21 //System.out.println("Starting "+document.urlString()); 22 } 23 24 public int atParse() 25 { 26 return this.pos; 27 } 28 29 public int lastParse() 30 { 31 return this.lastpos; 32 } 33 34 // do a full text/tag parse 35 public String fullParse() 36 { 37 String reply = null; 38 int end; 39 int start; 40 41 this.lastpos = this.pos; 42 43 if (this.pos >= this.document.getContent().length()){ 44 return reply; 45 } 46 47 if (this.document.getContent().charAt(this.pos) == '<'){ 48 start = this.pos; 49 50 // if we're not at the end of the document, 51 // read the rest of the tag 52 if (this.pos == this.document.getContent().length() - 1){ 53 this.pos = this.document.getContent().length(); 54 return reply; 55 } 56 57 // if the tag is a comment 58 if (this.pos < this.document.getContent().length() - 3 && 59 this.document.getContent().substring(this.pos, this.pos+4).equals("<!--")){ 60 end = this.document.getContent().substring(this.pos).indexOf("-->") + 3 + this.pos; 61 reply = this.document.getContent().substring(this.pos, end); 62 this.pos = end; 63 } 64 else 65 { // read up to the end of the tag 66 end = this.pos + 1; 67 while (end < this.document.getContent().length() && 68 this.document.getContent().charAt(end) != '>'){ 69 end ++; 70 } 71 72 // get the whole of the tag into 'reply', and 73 // set the current pos to immediately after the tag 74 if (end < this.document.getContent().length()){ 75 reply = this.document.getContent().substring(this.pos, end + 1); 76 this.pos = end + 1; 77 } 78 // patch the trailing > onto the tag string 79 else { 80 reply = this.document.getContent().substring(this.pos, end) + ">"; 81 this.pos = end; 82 } 83 } 84 } 74 85 else 75 { reply = this.document.getContent().substring(this.pos, end) + ">"; 76 this.pos = end; 77 } 78 } 79 } 80 else 81 { // hunt for the beginning of the next tag 82 start = this.pos; 83 while ((this.pos < this.document.getContent().length()) && 84 
(this.document.getContent().charAt(this.pos) != '<')) 85 { this.pos ++; 86 } 87 88 // return everything up to that tag 89 reply = this.document.getContent().substring(start, this.pos); 90 } 91 return reply; 92 } 93 94 // Get the next tag to parse 95 public String nextParse() 96 { String reply = null; 97 int end; 98 99 this.lastpos = this.pos; 100 101 if (this.document.getContent() == null) 102 { return null; 103 } 104 105 if (this.pos >= this.document.getContent().length()) 106 { return reply; 107 } 108 109 // hunt for the beginning of the next tag 86 { // hunt for the beginning of the next tag 87 start = this.pos; 110 88 while ((this.pos < this.document.getContent().length()) && 111 (this.document.getContent().charAt(this.pos) != '<')) 112 { this.pos ++; 113 } 114 115 // if we're not at the end of the document, 116 // read the rest of the tag 117 if (this.pos < this.document.getContent().length()) 118 { if (this.pos == this.document.getContent().length() - 1) 119 { this.pos = this.document.getContent().length(); 120 return reply; 89 (this.document.getContent().charAt(this.pos) != '<')){ 90 this.pos ++; 91 } 92 93 // return everything up to that tag 94 reply = this.document.getContent().substring(start, this.pos); 95 } 96 return reply; 97 } 98 99 // Get the next tag to parse 100 public String nextParse() 101 { 102 String reply = null; 103 int end; 104 105 this.lastpos = this.pos; 106 107 if (this.document.getContent() == null){ 108 return null; 109 } 110 111 if (this.pos >= this.document.getContent().length()){ 112 return reply; 113 } 114 115 // hunt for the beginning of the next tag 116 while ((this.pos < this.document.getContent().length()) && 117 (this.document.getContent().charAt(this.pos) != '<')){ 118 this.pos ++; 119 } 120 121 // if we're not at the end of the document, 122 // read the rest of the tag 123 if (this.pos < this.document.getContent().length()){ 124 if (this.pos == this.document.getContent().length() - 1){ 125 this.pos = 
this.document.getContent().length(); 126 return reply; 127 } 128 129 end = this.pos + 1; 130 while (end < this.document.getContent().length() && 131 this.document.getContent().charAt(end) != '>'){ 132 end ++; 133 } 134 135 // get the whole of the tag into 'reply', and 136 // set the current pos to immediately after the tag 137 if (end < this.document.getContent().length()){ 138 reply = this.document.getContent().substring(this.pos, end + 1); 139 this.pos = end + 1; 140 } 141 else{ 142 this.pos = end; 143 } 144 } 145 return reply; 146 } 147 148 149 /* -- 150 -- return next HREF value we come across in the document 151 -- 152 -- NB: the checking of the quotes etc is rather lazy - and should be tidied 153 -- 154 */ 155 public String nextHREF() 156 { 157 String reply; 158 int start, end; 159 boolean quoted; 160 HTMLTag tag; 161 162 reply = this.nextParse(); 163 while (this.pos < this.document.getContent().length()){ 164 tag = new HTMLTag(reply); 165 166 if (tag.tagName().equals("a")) { 167 start = reply.indexOf("href"); 168 if (start == -1) { 169 start = reply.indexOf("HREF"); 170 } 171 172 if (start >= 0) { 173 start += 4; 174 quoted = false; 175 while (reply.charAt(start) == ' ' || 176 reply.charAt(start) == '=' || 177 reply.charAt(start) == '"') { 178 if (reply.charAt(start) == '"') { 179 quoted = true; 121 180 } 122 123 end = this.pos + 1; 124 while ( end < this.document.getContent().length() && 125 this.document.getContent().charAt(end) != '>') 126 { end ++; 181 start ++; 182 } 183 184 end = -1; 185 if (quoted) { 186 end = reply.indexOf('"', start+1); 187 } 188 if (end == -1) { 189 end = reply.indexOf(' ', start+1); 190 if (end == -1) { 191 end = reply.length() - 1; 127 192 } 128 129 // get the whole of the tag into 'reply', and 130 // set the current pos to immediately after the tag 131 if (end < this.document.getContent().length()) 132 { reply = this.document.getContent().substring(this.pos, end + 1); 133 this.pos = end + 1; 134 } 135 else 136 { this.pos = end; 
137 } 138 } 193 } 194 195 reply = reply.substring(start, end); 196 return reply; 197 } 198 } 199 reply = this.nextParse(); 200 } 201 return null; 202 } 203 204 // Return the next link in the page 205 public String nextLink(AppletContext ac) 206 { 207 String reply, reply2; 208 int start, end; 209 boolean quoted; 210 HTMLTag tag; 211 212 if (this.document.getContent() == null){ 213 return null; 214 } 215 216 reply = this.nextParse(); 217 while (this.pos < this.document.getContent().length()){ 218 tag = new HTMLTag(reply); 219 if (tag.tagName().equals("a")){ 220 reply = tag.idValue("href"); 221 } 222 else if (tag.tagName().equals("frame")){ // image tag 223 reply = tag.idValue("src"); 224 } 225 else if (tag.tagName().equals("area")) { // image map area 226 reply = tag.idValue("href"); 227 } 228 else if (tag.tagName().equals("frame")){ 229 reply = tag.idValue("src"); 230 } 231 else { 232 reply = null; 233 } 234 235 if (reply != null){ 139 236 return reply; 140 } 141 142 143 /* -- 144 -- return next HREF value we come across in the document 145 -- 146 -- NB: the checking of the quotes etc is rather lazy - and should be tidied 147 -- 148 */ 149 public String nextHREF() 150 { String reply; 151 int start, end; 152 boolean quoted; 153 HTMLTag tag; 154 155 reply = this.nextParse(); 156 while (this.pos < this.document.getContent().length()) 157 { tag = new HTMLTag(reply); 158 159 if (tag.tagName().equals("a")) 160 { start = reply.indexOf("href"); 161 if (start == -1) 162 { start = reply.indexOf("HREF"); 163 } 164 165 if (start >= 0) 166 { start += 4; 167 quoted = false; 168 while (reply.charAt(start) == ' ' || 169 reply.charAt(start) == '=' || 170 reply.charAt(start) == '"') 171 { if (reply.charAt(start) == '"') 172 { quoted = true; 173 } 174 start ++; 175 } 176 177 end = -1; 178 if (quoted) 179 { end = reply.indexOf('"', start+1); 180 } 181 if (end == -1) 182 { end = reply.indexOf(' ', start+1); 183 if (end == -1) 184 { end = reply.length() - 1; 185 } 186 } 187 188 reply = 
reply.substring(start, end); 189 return reply; 190 } 191 } 192 reply = this.nextParse(); 193 } 194 return null; 195 } 196 197 // Return the next link in the page 198 public String nextLink(AppletContext ac) 199 { String reply, reply2; 200 int start, end; 201 boolean quoted; 202 HTMLTag tag; 203 204 if (this.document.getContent() == null) 205 { return null; 206 } 207 208 reply = this.nextParse(); 209 while (this.pos < this.document.getContent().length()) 210 { tag = new HTMLTag(reply); 211 if (tag.tagName().equals("a")) 212 { reply = tag.idValue("href"); 213 } 214 else if (tag.tagName().equals("frame")) // image tag 215 { reply = tag.idValue("src"); 216 } 217 else if (tag.tagName().equals("area")) // image map area 218 { reply = tag.idValue("href"); 219 } 220 else if (tag.tagName().equals("frame")) 221 { reply = tag.idValue("src"); 222 } 223 else 224 { reply = null; 225 } 226 227 if (reply != null) 228 { return reply; 229 } 230 231 reply = this.nextParse(); 232 } 233 return null; 234 } 237 } 238 239 reply = this.nextParse(); 240 } 241 return null; 242 } 235 243 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java
r8705 r8710 44 44 { 45 45 String comment = new String(ch, start, length); 46 47 46 parent.comment(comment); 48 47 } … … 182 181 tagName.equals("link") || 183 182 tagName.equals("base") || 184 tagName.equals("img") ||183 //tagName.equals("img") || 185 184 tagName.equals("hr")) { 186 185 isSingleton = true;
Note: See TracChangeset for help on using the changeset viewer.