Changeset 8709
- Timestamp:
- 2004-11-30T15:42:28+13:00 (19 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLBlock.java
r8476 r8709 4 4 5 5 public class HTMLBlock 6 { Vector content; 7 int tagpos; 8 int endpos; 9 10 public HTMLBlock(int docpos) 11 { this.content = new Vector(1); 12 this.tagpos = docpos; 13 } 14 15 public HTMLBlock(int docpos, int endpos) 16 { this.content = new Vector(1); 17 this.tagpos = docpos; 18 this.endpos = endpos; 19 } 20 21 public void addTag(HTMLTag tag) 22 { if (tag.endPos() > this.endpos) 23 { this.endpos = tag.endPos(); 24 } 25 content.addElement(tag); 26 } 27 28 /** 29 * @deprecccate 30 public void addText(String text) 31 { content.addElement(text); 32 } 33 */ 34 35 public void addText(HTMLCText text) 36 { if (text.endPos() > this.endpos) 37 { this.endpos = text.endPos(); 38 } 39 content.addElement(text); 40 } 41 42 /** 43 * @return HTML encoded <code>String</code> of the document 44 */ 45 public String contentString() 46 { int member; 47 StringBuffer reply; 48 String type; 49 50 reply = new StringBuffer(); 51 for (member = 1; member < content.size() - 1; member ++) 52 { /*if (content.elementAt(member) instanceof String) 53 { reply.append((String) content.elementAt(member)); 54 } 55 else 56 */ 57 if (content.elementAt(member) instanceof HTMLCText) 58 { reply.append(((HTMLCText) content.elementAt(member)).toString()); 59 } 60 } 61 return reply.toString(); 62 } 63 64 /** 65 * Returns document position of start of the block 66 */ 67 public int startPos() 68 { return this.tagpos; 69 } 70 71 /** 72 * Returns document position of end of the block 73 */ 74 public int endPos() 75 { return this.tagpos; 76 } 77 78 /** 79 * @return head (first) tag of the block 80 */ 81 public HTMLTag headTag() 82 { return ((HTMLTag) content.elementAt(0)); 83 } 84 85 /** 86 * @return name (as a <code>String</code>) of the tag at the head of the block 87 */ 88 public String headTagName() 89 { return ((HTMLTag) content.elementAt(0)).tagName(); 90 } 91 92 /** 93 * @return name of the tail (last) tag of the block as a <code>String</code> 94 */ 95 public String tailTagName() 96 { return (((HTMLTag) content.elementAt(content.size() - 1)).tagName()); 97 } 98 99 100 /** 101 * @return: starting element character position of the item-th item 102 */ 103 private int elementStartPos(int item) 104 { Object element; 105 int pos = -1; 6 { 7 Vector content; 8 int tagpos; 9 int endpos; 10 11 public HTMLBlock(int docpos) 12 { 13 this.content = new Vector(1); 14 this.tagpos = docpos; 15 } 16 17 public HTMLBlock(int docpos, int endpos) 18 { 19 this.content = new Vector(1); 20 this.tagpos = docpos; 21 this.endpos = endpos; 22 } 23 24 public void addTag(HTMLTag tag) 25 { 26 if (tag.endPos() > this.endpos){ 27 this.endpos = tag.endPos(); 28 } 29 content.addElement(tag); 30 } 31 32 /** 33 * @deprecccate 34 public void addText(String text) 35 { content.addElement(text); 36 } 37 */ 38 39 public void addText(HTMLCText text) 40 { 41 if (text.endPos() > this.endpos){ 42 this.endpos = text.endPos(); 43 } 44 content.addElement(text); 45 } 46 47 /** 48 * @return HTML encoded <code>String</code> of the document 49 */ 50 public String contentString() 51 { 52 int member; 53 StringBuffer reply; 54 String type; 55 56 reply = new StringBuffer(); 57 for (member = 1; member < content.size() - 1; member ++){ 58 /*if (content.elementAt(member) instanceof String){ 59 reply.append((String) content.elementAt(member)); 60 } 61 else 62 */ 63 if (content.elementAt(member) instanceof HTMLCText){ 64 reply.append(((HTMLCText) content.elementAt(member)).toString()); 65 } 66 } 67 return reply.toString(); 68 } 69 70 /** 71 * Returns document position of start of the block 72 */ 73 public int startPos() 74 { 75 return this.tagpos; 76 } 77 78 /** 79 * Returns document position of end of the block 80 */ 81 public int endPos() 82 { 83 return this.tagpos; 84 } 85 86 /** 87 * @return head (first) tag of the block 88 */ 89 public HTMLTag headTag() 90 { 91 return ((HTMLTag) content.elementAt(0)); 92 } 93 94 /** 95 * @return name (as a <code>String</code>) of the tag at the head of the block 96 */ 97 public String headTagName() 98 { 99 return ((HTMLTag) content.elementAt(0)).tagName(); 100 } 101 102 /** 103 * @return name of the tail (last) tag of the block as a <code>String</code> 104 */ 105 public String tailTagName() 106 { 107 return (((HTMLTag) content.elementAt(content.size() - 1)).tagName()); 108 } 109 110 111 /** 112 * @return: starting element character position of the item-th item 113 */ 114 private int elementStartPos(int item) 115 { 116 Object element; 117 int pos = -1; 106 118 107 element = this.content.elementAt(item); 108 if (element instanceof HTMLTag) 109 { pos = ((HTMLTag) element).startPos(); 110 } 111 else if (element instanceof HTMLCText) 112 { pos = ((HTMLCText) element).startPos(); 113 } 114 return pos; 115 } 116 117 /** 118 * @return: ending element character position of the item-th item 119 */ 120 private int elementEndPos(int item) 121 { Object element; 122 int pos = -1; 123 124 element = this.content.elementAt(item); 125 if (element instanceof HTMLTag) 126 { pos = ((HTMLTag) element).endPos(); 127 } 128 else if (element instanceof HTMLCText) 129 { pos = ((HTMLCText) element).endPos(); 130 } 131 return pos; 132 } 133 134 /** 135 * @return the HTMLBlock of the indicated subitems 136 */ 137 public HTMLBlock subBlock(int startitem, int enditem) 138 { int i; 139 Object element; 140 HTMLBlock reply; 141 reply = new HTMLBlock(this.elementStartPos(startitem), 142 this.elementEndPos(enditem-1)); 143 // copy all the bits 144 for (i = startitem; i < enditem; i++) 145 { element = this.content.elementAt(i); 146 if (element instanceof HTMLTag) 147 { reply.addTag((HTMLTag) element); 148 } 149 else if (element instanceof HTMLCText) 150 { reply.addText((HTMLCText) element); 151 } 152 /* 153 else if (element instanceof String) 154 { reply.addText((String) element); 155 } 156 */ 157 } 158 return reply; 159 } 160 161 /** 162 * Return the position of the given <code>HTMLBlock</code> within this block 163 * @return the tag index of the child block; or < 0 if not a chile 164 */ 165 public int find(HTMLBlock subblock) 166 { int start, end, at, attag; 167 HTMLTag tag; 168 169 start = 0; 170 end = this.content.size(); 171 while (start != end) 172 { /* compare positions */ 173 at = (start + end) / 2; 174 attag = at; 175 while (attag < end && 176 (content.elementAt(attag) instanceof HTMLTag) == false) 177 { attag ++; 178 } 179 180 if (attag == end) 181 { end = at; 182 continue; 183 } 184 tag = (HTMLTag) content.elementAt(attag); 185 if (tag.startPos() > subblock.startPos()) 186 { end = at; 187 } 188 else if (tag.startPos() < subblock.startPos()) 189 { start = attag + 1; 190 } 191 else 192 { return attag; 193 } 194 } 195 return -1; 196 } 197 198 /** 199 * @return The raw HTML of the block 200 */ 201 public String HTMLString() 202 { int loop; 203 StringBuffer reply; 204 205 reply = new StringBuffer(); 206 for (loop = 0; loop < this.content.size(); loop ++) 207 { /* 208 if (content.elementAt(loop) instanceof String) 209 { reply.append((String) content.elementAt(loop)); 210 } 211 else 212 */ 213 if (content.elementAt(loop) instanceof HTMLCText) 214 { reply.append(((HTMLCText) content.elementAt(loop)).toString()); 215 } 216 else if (content.elementAt(loop) instanceof HTMLTag) 217 { reply.append(((HTMLTag) content.elementAt(loop)).toString()); 218 } 219 } 220 return reply.toString(); 221 } 222 223 /** 224 * @return The text of the block - no HTML tags. If no text is present, any 225 * <code>alt</code> information for <code>img</code> tags will be given 226 * instead 227 */ 228 public String toString() 229 { int member; 230 StringBuffer reply; 231 StringBuffer ireply; 232 233 reply = new StringBuffer(""); 234 ireply = new StringBuffer(""); 235 for (member = 0; member < content.size(); member ++) 236 { /*if (content.elementAt(member) instanceof String) 237 { reply.append((String) content.elementAt(member)); 238 } 239 */ 240 if (content.elementAt(member) instanceof HTMLCText) 241 { reply.append(((HTMLCText) content.elementAt(member)).toString()); 242 } 243 else if (content.elementAt(member) instanceof HTMLTag) 244 { if (((HTMLTag) content.elementAt(member)).tagName().equals("img")) 245 { ireply.append(((HTMLTag) content.elementAt(member)).idValue("alt")); 246 } 247 } 248 } 249 reply = HTMLCText.cleanString(reply); 250 ireply = HTMLCText.cleanString(ireply); 251 // reply = reply.trim(); 252 if (reply == null || reply.length() == 0) 253 { return ireply.toString(); 254 } 255 return reply.toString(); 256 } 257 258 public HTMLObject elementAt(int at) 259 { return (HTMLObject) this.content.elementAt(at); 260 } 261 262 /** 263 * @return the number of elements in the block 264 */ 265 public int size() 266 { return this.content.size(); 267 } 268 269 /** 270 * @return an enumeration of all elements in the block 271 */ 272 public Enumeration elements() 273 { Enumeration enumer; 119 element = this.content.elementAt(item); 120 if (element instanceof HTMLTag){ 121 pos = ((HTMLTag) element).startPos(); 122 } 123 else if (element instanceof HTMLCText){ 124 pos = ((HTMLCText) element).startPos(); 125 } 126 return pos; 127 } 128 129 /** 130 * @return: ending element character position of the item-th item 131 */ 132 private int elementEndPos(int item) 133 { 134 Object element; 135 int pos = -1; 136 137 element = this.content.elementAt(item); 138 if (element instanceof HTMLTag){ 139 pos = ((HTMLTag) element).endPos(); 140 } 141 else if (element instanceof HTMLCText){ 142 pos = ((HTMLCText) element).endPos(); 143 } 144 return pos; 145 } 146 147 /** 148 * @return the HTMLBlock of the indicated subitems 149 */ 150 public HTMLBlock subBlock(int startitem, int enditem) 151 { 152 int i; 153 Object element; 154 HTMLBlock reply; 155 reply = new HTMLBlock(this.elementStartPos(startitem), 156 this.elementEndPos(enditem-1)); 157 // copy all the bits 158 for (i = startitem; i < enditem; i++){ 159 element = this.content.elementAt(i); 160 if (element instanceof HTMLTag){ 161 reply.addTag((HTMLTag) element); 162 } 163 else if (element instanceof HTMLCText){ 164 reply.addText((HTMLCText) element); 165 } 166 /* 167 else if (element instanceof String) 168 { reply.addText((String) element); 169 } 170 */ 171 } 172 return reply; 173 } 174 175 /** 176 * Return the position of the given <code>HTMLBlock</code> within this block 177 * @return the tag index of the child block; or < 0 if not a chile 178 */ 179 public int find(HTMLBlock subblock) 180 { 181 int start, end, at, attag; 182 HTMLTag tag; 183 184 start = 0; 185 end = this.content.size(); 186 while (start != end){ 187 /* compare positions */ 188 at = (start + end) / 2; 189 attag = at; 190 while (attag < end && 191 (content.elementAt(attag) instanceof HTMLTag) == false){ 192 attag ++; 193 } 194 195 if (attag == end){ 196 end = at; 197 continue; 198 } 199 tag = (HTMLTag) content.elementAt(attag); 200 if (tag.startPos() > subblock.startPos()){ 201 end = at; 202 } 203 else if (tag.startPos() < subblock.startPos()){ 204 start = attag + 1; 205 } 206 else { 207 return attag; 208 } 209 } 210 return -1; 211 } 212 213 /** 214 * @return The raw HTML of the block 215 */ 216 public String HTMLString() 217 { 218 int loop; 219 StringBuffer reply; 220 221 reply = new StringBuffer(); 222 for (loop = 0; loop < this.content.size(); loop ++){ 223 /* 224 if (content.elementAt(loop) instanceof String) 225 { reply.append((String) content.elementAt(loop)); 226 } 227 else 228 */ 229 if (content.elementAt(loop) instanceof HTMLCText){ 230 reply.append(((HTMLCText) content.elementAt(loop)).toString()); 231 } 232 else if (content.elementAt(loop) instanceof HTMLTag){ 233 reply.append(((HTMLTag) content.elementAt(loop)).toString()); 234 } 235 } 236 return reply.toString(); 237 } 238 239 /** 240 * @return The text of the block - no HTML tags. If no text is present, any 241 * <code>alt</code> information for <code>img</code> tags will be given 242 * instead 243 */ 244 public String toString() 245 { 246 int member; 247 StringBuffer reply; 248 StringBuffer ireply; 249 250 reply = new StringBuffer(""); 251 ireply = new StringBuffer(""); 252 for (member = 0; member < content.size(); member ++){ 253 /*if (content.elementAt(member) instanceof String) 254 { reply.append((String) content.elementAt(member)); 255 } 256 */ 257 if (content.elementAt(member) instanceof HTMLCText){ 258 reply.append(((HTMLCText) content.elementAt(member)).toString()); 259 } 260 else if (content.elementAt(member) instanceof HTMLTag){ 261 if (((HTMLTag) content.elementAt(member)).tagName().equals("img")){ 262 ireply.append(((HTMLTag) content.elementAt(member)).idValue("alt")); 263 } 264 } 265 } 266 reply = HTMLCText.cleanString(reply); 267 ireply = HTMLCText.cleanString(ireply); 268 // reply = reply.trim(); 269 if (reply == null || reply.length() == 0){ 270 return ireply.toString(); 271 } 272 return reply.toString(); 273 } 274 275 public HTMLObject elementAt(int at) 276 { 277 return (HTMLObject) this.content.elementAt(at); 278 } 279 280 /** 281 * @return the number of elements in the block 282 */ 283 public int size() 284 { 285 return this.content.size(); 286 } 287 288 /** 289 * @return an enumeration of all elements in the block 290 */ 291 public Enumeration elements() 292 { 293 Enumeration enumer; 274 294 275 enumer = new HTMLBlockEnumerator(this); 276 return enumer; 277 } 278 279 /** 280 * @return an enumeration of all tags in the block 281 */ 282 public Enumeration tagElements() 283 { Enumeration enumer; 284 285 enumer = new HTMLBlockTagEnumerator(this); 286 return enumer; 287 } 288 289 /** 290 * Finalization method 291 */ 292 protected void finalize() throws Throwable 293 { this.content = null; 294 super.finalize(); 295 } 295 enumer = new HTMLBlockEnumerator(this); 296 return enumer; 297 } 298 299 /** 300 * @return an enumeration of all tags in the block 301 */ 302 public Enumeration tagElements() 303 { 304 Enumeration enumer; 305 306 enumer = new HTMLBlockTagEnumerator(this); 307 return enumer; 308 } 309 310 /** 311 * Finalization method 312 */ 313 protected void finalize() throws Throwable 314 { 315 this.content = null; 316 super.finalize(); 317 } 296 318 } 297 319 298 320 final class HTMLBlockEnumerator implements Enumeration 299 { private HTMLBlock block; 300 int member; 301 302 public HTMLBlockEnumerator(HTMLBlock block) 303 { this.block = block; 304 this.member = 0; 305 } 306 307 public Object nextElement() 308 { Object element; 309 310 element = this.block.content.elementAt(member); 311 member ++; 312 return element; 313 } 314 315 public boolean hasMoreElements() 316 { if (block == null || this.member == this.block.content.size()) 317 { return false; 318 } 319 return true; 320 } 321 { 322 private HTMLBlock block; 323 int member; 324 325 public HTMLBlockEnumerator(HTMLBlock block) 326 { 327 this.block = block; 328 this.member = 0; 329 } 330 331 public Object nextElement() 332 { 333 Object element; 334 335 element = this.block.content.elementAt(member); 336 member ++; 337 return element; 338 } 339 340 public boolean hasMoreElements() 341 { 342 if (block == null || this.member == this.block.content.size()){ 343 return false; 344 } 345 return true; 346 } 321 347 } 322 348 323 349 final class HTMLBlockTagEnumerator implements Enumeration 324 { private HTMLBlock block; 325 int member; 326 327 public HTMLBlockTagEnumerator(HTMLBlock block) 328 { this.block = block; 329 this.member = 0; 330 while ( this.member < this.block.content.size() && 331 this.block.content.elementAt(this.member) instanceof HTMLTag == false) 332 { this.member ++; 333 } 334 } 335 336 public Object nextElement() 337 { Object element; 338 339 element = this.block.content.elementAt(member); 340 do 341 { member ++; 342 } while (this.block.content.elementAt(member) instanceof HTMLTag == false); 343 return element; 344 } 345 346 public boolean hasMoreElements() 347 { if (block == null || this.member == this.block.content.size()) 348 { return false; 349 } 350 return true; 351 } 350 { 351 private HTMLBlock block; 352 int member; 353 354 public HTMLBlockTagEnumerator(HTMLBlock block) 355 { 356 this.block = block; 357 this.member = 0; 358 while ( this.member < this.block.content.size() && 359 this.block.content.elementAt(this.member) instanceof HTMLTag == false){ 360 this.member ++; 361 } 362 } 363 364 public Object nextElement() 365 { 366 Object element; 367 368 element = this.block.content.elementAt(member); 369 do { 370 member ++; 371 } while (this.block.content.elementAt(member) instanceof HTMLTag == false); 372 return element; 373 } 374 375 public boolean hasMoreElements() 376 { 377 if (block == null || this.member == this.block.content.size()){ 378 return false; 379 } 380 return true; 381 } 352 382 } 353 383 -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLDoc.java
r6285 r8709 11 11 12 12 public class HTMLDoc 13 { String content; 14 String title; 15 URL url; 16 int pos; 17 int status; 18 HTMLBlockList blocklist; 19 List urls_out; 20 List urls_in; 21 Object note; 22 HTMLLoader loader; 23 HTMLBlock coded; 24 25 /** 26 * Constructor which loads a given document; the ref indicates from where! 27 */ 28 public HTMLDoc(URL url, int ref) 29 { 30 URLConnection connect; 31 int length, i, c; 32 byte inbuffer[]; 33 InputStream in; 34 35 this.url = url; 36 this.pos = 0; 37 this.blocklist = null; 38 this.title = null; 39 this.urls_out = null; 40 this.urls_in = null; 41 this.note = null; 42 this.status = 0; 43 44 if (ref >= 0) 45 { int attempts; 46 47 attempts = 0; 48 loader = new HTMLLoader(url); 49 loader.load(); 50 if (ref == 0) 51 { while(this.ready() == false) 52 { try 53 { 54 Thread.sleep(100); 55 } 56 catch (InterruptedException ex) 57 { 58 } 59 } 60 } 13 { 14 String content; 15 String title; 16 URL url; 17 int pos; 18 int status; 19 HTMLBlockList blocklist; 20 List urls_out; 21 List urls_in; 22 Object note; 23 HTMLLoader loader; 24 HTMLBlock coded; 25 26 /** 27 * Constructor which loads a given document; the ref indicates from where! 28 */ 29 public HTMLDoc(URL url, int ref) 30 { 31 URLConnection connect; 32 int length, i, c; 33 byte inbuffer[]; 34 InputStream in; 35 36 this.url = url; 37 this.pos = 0; 38 this.blocklist = null; 39 this.title = null; 40 this.urls_out = null; 41 this.urls_in = null; 42 this.note = null; 43 this.status = 0; 44 45 if (ref >= 0){ 46 int attempts; 47 48 attempts = 0; 49 loader = new HTMLLoader(url); 50 loader.load(); 51 if (ref == 0){ 52 while(this.ready() == false){ 53 try { 54 Thread.sleep(100); 55 } 56 catch (InterruptedException ex){ 57 58 } 61 59 } 62 else 63 { this.content = ""; 60 } 61 } 62 else { 63 this.content = ""; 64 } 65 } 66 67 public HTMLDoc(URL url) 68 { 69 this(url, 0); 70 } 71 72 /** 73 * @return <code>true</code> when a document has finished loading. 74 */ 75 public synchronized boolean ready() 76 { 77 if (this.loader == null){ 78 return true; 79 } 80 81 boolean loaded = loader.loaded(); 82 83 if (loaded == true){ 84 // reload if necessary 85 if (this.loader.error == true){ 86 if (this.status < 3 && this.loader.notFound == false){ 87 this.status ++; 88 this.loader = new HTMLLoader(this.url); 89 loader.load(); 90 return false; 64 91 } 65 } 66 67 public HTMLDoc(URL url) 68 { this(url, 0); 69 } 70 71 /** 72 * @return <code>true</code> when a document has finished loading. 73 */ 74 public synchronized boolean ready() 75 { if (this.loader == null) 76 { return true; 92 else { 93 this.status = 4; 94 this.content = null; 95 this.loader = null; 96 return true; 77 97 } 78 79 boolean loaded = loader.loaded(); 80 81 if (loaded == true) 82 { // reload if necessary 83 if (this.loader.error == true) 84 { if (this.status < 3 && this.loader.notFound == false) 85 { this.status ++; 86 this.loader = new HTMLLoader(this.url); 87 loader.load(); 88 return false; 89 } 90 else 91 { this.status = 4; 92 this.content = null; 93 this.loader = null; 94 return true; 95 } 96 } 97 this.content = loader.HTML(); 98 this.loader = null; 98 } 99 this.content = loader.HTML(); 100 this.loader = null; 101 } 102 return loaded; 103 } 104 105 /** 106 * Create a document from a file 107 */ 108 public HTMLDoc(URL url, String file) 109 { 110 FileInputStream _in; 111 byte[] data = new byte[128]; 112 int databytes; 113 StringBuffer content; 114 115 try { 116 _in = new FileInputStream(file); 117 if (_in != null){ 118 content = new StringBuffer(); 119 do{ 120 databytes = _in.read(data); 121 if (databytes > 0){ 122 content.append(new String(data, 0, databytes)); 123 } 124 } while (databytes >= 0); 125 this.content = content.toString(); 126 } 127 } 128 catch (IOException ex){ 129 this.content = null; 130 System.out.println(ex); 131 } 132 this.url = url; 133 this.pos = 0; 134 this.status = 0; 135 this.note = null; 136 this.blocklist = null; 137 this.title = null; 138 this.urls_out = null; 139 this.urls_in = null; 140 } 141 142 /** 143 * Create a url-only document: the document body is not stored 144 */ 145 public HTMLDoc(String url) 146 { 147 if (url != null){ 148 try{ 149 this.url = new URL(url); 150 } 151 catch (MalformedURLException ex){ 152 this.url = null; 153 } 154 } 155 else { 156 this.url = null; 157 } 158 this.pos = 0; 159 this.status = 0; 160 this.note = null; 161 this.content = null; 162 this.blocklist = null; 163 this.title = null; 164 this.urls_out = null; 165 this.urls_in = null; 166 } 167 168 static public HTMLDoc fromString(String url, String string) 169 { 170 HTMLDoc reply; 171 172 reply = new HTMLDoc(url); 173 reply.content = string; 174 return reply; 175 } 176 177 static public boolean validExtension(String url) 178 { 179 String extension; 180 181 // get file extension 182 extension = URLString.extension(url); 183 if (extension != null){ 184 extension = extension.toLowerCase(); 185 } 186 187 if (extension == null || 188 extension.indexOf("htm") >= 0 || 189 extension.indexOf("asp") >= 0){ 190 return true; 191 } 192 return false; 193 } 194 195 static boolean validExtension(URL url) 196 { 197 return validExtension(url.toString()); 198 } 199 200 /** 201 * Uncache a single document 202 */ 203 public static HTMLDoc uncacheDoc(String filebase, String url) 204 { 205 String pathname; 206 String filename; 207 HTMLDoc doc; 208 209 pathname = URLString.pathName(url); 210 if (pathname == null){ 211 return null; 212 } 213 214 filename = filebase + pathname.replace('/', File.pathSeparatorChar); 215 doc = new HTMLDoc(URLString.toURL(url), filename); 216 217 return doc; 218 } 219 220 /** 221 * @return URL (ie. Java URL object) of the document 222 */ 223 public URL getUrl() 224 { 225 return this.url; 226 } 227 228 // Report url to AppletContext 229 public void show(AppletContext ac) 230 { 231 URL url; 232 233 url = this.getUrl(); 234 if (url != null){ 235 ac.showDocument(url); 236 } 237 } 238 239 // Get a HTMLBlock list of the wanted text in an HTMLdoc 240 public void getSelectContent(int level) 241 { 242 HTMLBlockList list; 243 244 list = new HTMLBlockList(this, level); 245 this.blocklist = list; 246 } 247 248 /** 249 * return the selected content 250 */ 251 public HTMLBlockList selectContent() 252 { 253 return this.blocklist; 254 } 255 256 /** 257 * get header blocks as selected content 258 */ 259 public HTMLBlockList headerBlocks() 260 { 261 if (this.blocklist == null){ 262 this.getSelectContent(6); 263 } 264 return this.blocklist; 265 } 266 267 /** 268 * return a tagged-up select with an <A NAME=> at the beginning of each Hx 269 */ 270 public String taggedSelectContent() 271 { 272 int heading, offset, pos; 273 String reply; 274 275 if (this.blocklist == null || 276 this.blocklist.size() == 0){ 277 return this.content; 278 } 279 280 reply = ""; 281 offset = 0; 282 for (heading = 0; heading < this.blocklist.size(); heading ++){ 283 reply = reply + 284 this.content.substring(offset, this.blocklist.tagPos(heading)) + 285 "<A NAME=\"_L"+heading+"\"></A>"; 286 offset = this.blocklist.tagPos(heading); 287 } 288 reply = reply + this.content.substring(offset, this.content.length()); 289 return reply; 290 } 291 292 /** 293 * Return enumeration of headings in document 294 */ 295 public Enumeration headings() 296 { 297 if (this.blocklist == null){ 298 this.getSelectContent(6); 299 } 300 return this.blocklist.elements(); 301 } 302 303 /** 304 * @return HTMLBlock of the child text underneath a heading 305 */ 306 public HTMLBlock headingChildText(int headref) 307 { 308 HTMLBlock pageblock; 309 HTMLBlock headblock; 310 HTMLBlock childblock; 311 int blockstart, blockend; 312 313 pageblock = this.getCodedContent(); // get all the coded bits of the page 314 headblock = this.headerBlocks().tagBlock(headref); 315 316 blockstart = pageblock.find(headblock); 317 blockstart += headblock.size(); 318 if (headref == this.headerBlocks().size() - 1) { 319 blockend = pageblock.size(); 320 } 321 else { 322 blockend = pageblock.find(this.headerBlocks().tagBlock(headref+1)); 323 } 324 childblock = pageblock.subBlock(blockstart, blockend); 325 326 return childblock; 327 } 328 329 /** 330 * Add incoming url to list of incoming references 331 */ 332 public void addUrlIn(String url) 333 { 334 if (this.urls_in == null){ 335 this.urls_in = new ArrayList(); 336 } 337 this.urls_in.add(url); 338 } 339 340 /** 341 * @return a String of all incoming urls 342 */ 343 public String urlsInString() 344 { 345 if (this.urls_in != null){ 346 return this.urls_in.toString(); 347 } 348 return ""; 349 } 350 351 /** 352 * @return an enumeration of all incoming urls as Strings 353 */ 354 public Iterator urlsIn() 355 { 356 if (this.urls_in == null){ 357 return null; 358 } 359 return this.urls_in.iterator(); 360 } 361 362 /** 363 * @return the <code>StringVector</code> of all incoming urls 364 */ 365 public List urlsInList() 366 { 367 return this.urls_in; 368 } 369 370 /** 371 * @return <code>true</code> if the given url is linked to by this 372 * document 373 */ 374 public boolean linkedTo(URL url) 375 { 376 if (this.urls_out != null && 377 this.urls_out.indexOf(url.toString()) >= 0){ 378 return true; 379 } 380 return false; 381 } 382 383 /** 384 * See above: returns <code>true</code> if the document is linked to 385 * by this document. 386 */ 387 public boolean linkedTo(HTMLDoc doc) 388 { 389 return linkedTo(doc.getUrl()); 390 } 391 392 /** 393 * Add outgoing url to list of outgoing references 394 */ 395 public void addUrlOut(String url) 396 { 397 if (this.urls_out == null){ 398 this.urls_out = new ArrayList(); 399 } 400 this.urls_out.add(url); 401 } 402 403 /** 404 * @return a String of all outgoing urls 405 */ 406 public String urlsOutString() 407 { 408 if (this.urls_out != null){ 409 return this.urls_out.toString(); 410 } 411 return ""; 412 } 413 414 /** 415 * @return the <code>StringVector</code> of all outgoing urls 416 */ 417 public List urlsOutList() 418 { 419 return this.urls_out; 420 } 421 422 /** 423 * Return an enumeration of outbound urls 424 */ 425 public Iterator urlsOut() 426 { 427 if (this.urls_out == null){ 428 return null; 429 } 430 return this.urls_out.iterator(); 431 } 432 433 private static int urlListPos(List urlList, String url) 434 { 435 for (int i = 0; i < urlList.size(); i ++){ 436 if (((String) urlList.get(i)).equals(url)){ 437 return i; 438 } 439 } 440 return -1; 441 } 442 443 // Rename internally the content of a list 444 private void __renameUrl(List urlList, String oldUrl, String newUrl) 445 { 446 if (urlList != null){ 447 int oldPos = urlListPos(urlList, oldUrl); 448 449 if (oldPos >= 0){ 450 if (urlListPos(urlList, newUrl) >= 0){ 451 // delete oldUrl: now duplicate 452 urlList.remove(oldPos); 99 453 } 100 return loaded; 101 } 102 103 /** 104 * Create a document from a file 105 */ 106 public HTMLDoc(URL url, String file) 107 { FileInputStream _in; 108 byte[] data = new byte[128]; 109 int databytes; 110 StringBuffer content; 111 112 try 113 { _in = new FileInputStream(file); 114 if (_in != null) 115 { content = new StringBuffer(); 116 117 do 118 { databytes = _in.read(data); 119 if (databytes > 0) 120 { content.append(new String(data, 0, databytes)); 121 } 122 } while (databytes >= 0); 123 this.content = content.toString(); 124 } 454 else { 455 // rename oldUrl 456 urlList.set(oldPos, newUrl); 125 457 } 126 catch (IOException ex) 127 { this.content = null; 128 System.out.println(ex); 458 } 459 } 460 } 461 462 /** 463 * Rename inbound/outbound url references 464 */ 465 public void renameUrl(String oldUrl, String newUrl) 466 { 467 this.__renameUrl(this.urls_in, oldUrl, newUrl); 468 this.__renameUrl(this.urls_out, oldUrl, newUrl); 469 } 470 471 /** 472 * Save document 473 */ 474 public boolean save(String filename, boolean select) 475 { 476 FileOutputStream fout; 477 PrintWriter pout; 478 479 String flattened = URLString.flattenedUrl(filename); 480 481 try{ 482 HTMLDocAnchorList anchorList = new HTMLDocAnchorList(this); 483 484 anchorList.flattenLinks(); 485 486 fout = new FileOutputStream(flattened); 487 pout = new PrintWriter(fout); 488 if (select == true) { 489 pout.print(this.taggedSelectContent()); 490 } 491 else { 492 pout.print(this.content); 493 } 494 pout.close(); 495 fout.close(); 496 } 497 catch (IOException ex) { 498 System.out.println(ex); 499 return false; 500 } 501 return true; 502 } 503 504 /** 505 * Save document content 506 */ 507 public boolean save(String base) 508 { 509 return this.save(base, false); 510 } 511 512 /** 513 * Save selected content 514 */ 515 public boolean saveSelect(String base) 516 { 517 return this.save(base, true); 518 } 519 520 /** 521 * Get title of page 522 */ 523 public void getTitle() 524 { 525 HTMLDocTitle doctitle; 526 527 if (this.content != null){ 528 doctitle = new HTMLDocTitle(this); 529 this.title = doctitle.Title(); 530 } 531 } 532 533 /** 534 * @return: title of page 535 */ 536 public String titleString() 537 { 538 if (this.title == null){ 539 this.getTitle(); 540 } 541 return this.title; 542 } 543 544 /** 545 * @return a <code>String</code> of the url of the page 546 */ 547 public String urlString() 548 { 549 return this.url.toString(); 550 } 551 552 /** 553 * Replace the HTML text of the page with the provided HTML string 554 * content. This is an extremely dangerous call to use, as dependent 555 * objects such as tag objects will be immediately rendered inaccurate 556 * in terms of pointers held to the original text. 557 */ 558 protected void setContent(String newContent) 559 { 560 this.content = newContent; 561 } 562 563 /** 564 * @return a <code>String</code> of the HTML text of the page 565 */ 566 public String getContent() 567 { 568 return this.content; 569 } 570 571 /** 572 * @return a <code>HTMLBlock</code> of the page content 573 */ 574 public HTMLBlock getCodedContent() 575 { 576 // if we haven't already got it, code the bugger up right now! 577 if (this.coded == null){ 578 HTMLParser parser; 579 String next; 580 HTMLBlock coded; 581 HTMLTag tag; 582 HTMLCText text; 583 584 if (this.content == null){ 585 return null; 586 } 587 588 parser = new HTMLParser(this); 589 coded = new HTMLBlock(0, this.content.length()); 590 591 parser.startParse(); 592 next = parser.fullParse(); 593 while (next != null){ 594 if (HTMLTag.isTag(next)){ 595 tag = new HTMLTag(next, parser.lastParse(), parser.atParse()); 596 coded.addTag(tag); 129 597 } 130 this.url = url; 131 this.pos = 0; 132 this.status = 0; 133 this.note = null; 134 this.blocklist = null; 135 this.title = null; 136 this.urls_out = null; 137 this.urls_in = null; 138 } 139 140 /** 141 * Create a url-only document: the document body is not stored 142 */ 143 public HTMLDoc(String url) 144 { if (url != null) 145 { try 146 { this.url = new URL(url); 147 } 148 catch (MalformedURLException ex) 149 { 150 this.url = null; 151 } 598 else { 599 text = new HTMLCText(next, parser.lastParse(), parser.atParse()); 600 coded.addText(text); 152 601 } 153 else 154 { this.url = null; 155 } 156 this.pos = 0; 157 this.status = 0; 158 this.note = null; 159 this.content = null; 160 this.blocklist = null; 161 this.title = null; 162 this.urls_out = null; 163 this.urls_in = null; 164 } 165 166 static public HTMLDoc fromString(String url, String string) 167 { HTMLDoc reply; 168 169 reply = new HTMLDoc(url); 170 reply.content = string; 171 return reply; 172 } 173 174 static public boolean validExtension(String url) 175 { String extension; 176 177 // get file extension 178 extension = URLString.extension(url); 179 if (extension != null) 180 { extension = extension.toLowerCase(); 181 } 182 183 if (extension == null || 184 extension.indexOf("htm") >= 0 || 185 extension.indexOf("asp") >= 0) 186 { return true; 187 } 188 return false; 189 } 190 191 static boolean validExtension(URL url) 192 { return validExtension(url.toString()); 193 } 194 195 /** 196 * Uncache a single document 197 */ 198 public static HTMLDoc uncacheDoc(String filebase, String url) 199 { String pathname; 200 String filename; 201 HTMLDoc doc; 202 203 pathname = URLString.pathName(url); 204 if (pathname == null) 205 { return null; 206 } 207 208 filename = filebase + pathname.replace('/', File.pathSeparatorChar); 209 doc = new HTMLDoc(URLString.toURL(url), filename); 210 211 return doc; 212 } 213 214 /** 215 * @return URL (ie. Java URL object) of the document 216 */ 217 public URL getUrl() 218 { return this.url; 219 } 220 221 // Report url to AppletContext 222 public void show(AppletContext ac) 223 { URL url; 224 225 url = this.getUrl(); 226 if (url != null) 227 { ac.showDocument(url); 228 } 229 } 230 231 // Get a HTMLBlock list of the wanted text in an HTMLdoc 232 public void getSelectContent(int level) 233 { HTMLBlockList list; 234 235 list = new HTMLBlockList(this, level); 236 this.blocklist = list; 237 } 238 239 /** 240 * return the selected content 241 */ 242 public HTMLBlockList selectContent() 243 { return this.blocklist; 244 } 245 246 /** 247 * get header blocks as selected content 248 */ 249 public HTMLBlockList headerBlocks() 250 { if (this.blocklist == null) 251 { this.getSelectContent(6); 252 } 253 return this.blocklist; 254 } 255 256 /** 257 * return a tagged-up select with an <A NAME=> at the beginning of each Hx 258 */ 259 public String taggedSelectContent() 260 { int heading, offset, pos; 261 String reply; 262 263 if (this.blocklist == null || 264 this.blocklist.size() == 0) 265 { return this.content; 266 } 267 268 reply = ""; 269 offset = 0; 270 for (heading = 0; heading < this.blocklist.size(); heading ++) 271 { reply = reply + 272 this.content.substring(offset, this.blocklist.tagPos(heading)) + 273 "<A NAME=\"_L"+heading+"\"></A>"; 274 offset = this.blocklist.tagPos(heading); 275 } 276 reply = reply + this.content.substring(offset, this.content.length()); 277 return reply; 278 } 279 280 /** 281 * Return enumeration of headings in document 282 */ 283 public Enumeration headings() 284 { if (this.blocklist == null) 285 { this.getSelectContent(6); 286 } 287 return this.blocklist.elements(); 288 } 289 290 /** 291 * @return HTMLBlock of the child text underneath a heading 292 */ 293 public HTMLBlock headingChildText(int headref) 294 { HTMLBlock pageblock; 295 HTMLBlock headblock; 296 HTMLBlock childblock; 297 int blockstart, blockend; 298 299 pageblock = this.getCodedContent(); // get all the coded bits of the page 300 headblock = this.headerBlocks().tagBlock(headref); 301 302 blockstart = pageblock.find(headblock); 303 blockstart += headblock.size(); 304 if (headref == this.headerBlocks().size() - 1) 305 { blockend = pageblock.size(); 306 } 307 else 308 { blockend = pageblock.find(this.headerBlocks().tagBlock(headref+1)); 309 } 310 childblock = pageblock.subBlock(blockstart, blockend); 311 312 return childblock; 313 } 314 315 /** 316 * Add incoming url to list of incoming references 317 */ 318 public void addUrlIn(String url) 319 { if (this.urls_in == null) 320 { this.urls_in = new ArrayList(); 321 } 322 this.urls_in.add(url); 323 } 324 325 /** 326 * @return a String of all incoming urls 327 */ 328 public String urlsInString() 329 { if (this.urls_in != null) 330 { return this.urls_in.toString(); 331 } 332 return ""; 333 } 334 335 /** 336 * @return an enumeration of all incoming urls as Strings 337 */ 338 public Iterator urlsIn() 339 { if (this.urls_in == null) 340 { return null; 341 } 342 return this.urls_in.iterator(); 343 } 344 345 /** 346 * @return the <code>StringVector</code> of all incoming urls 347 */ 348 public List urlsInList() 349 { return this.urls_in; 350 } 351 352 /** 353 * @return <code>true</code> if the given url is linked to by this 354 * document 355 */ 356 public boolean linkedTo(URL url) 357 { if (this.urls_out != null && 358 this.urls_out.indexOf(url.toString()) >= 0) 359 { return true; 360 } 361 return false; 362 } 363 364 /** 365 * See above: returns <code>true</code> if the document is linked to 366 * by this document. 367 */ 368 public boolean linkedTo(HTMLDoc doc) 369 { return linkedTo(doc.getUrl()); 370 } 371 372 /** 373 * Add outgoing url to list of outgoing references 374 */ 375 public void addUrlOut(String url) 376 { if (this.urls_out == null) 377 { this.urls_out = new ArrayList(); 378 } 379 this.urls_out.add(url); 380 } 381 382 /** 383 * @return a String of all outgoing urls 384 */ 385 public String urlsOutString() 386 { if (this.urls_out != null) 387 { return this.urls_out.toString(); 388 } 389 return ""; 390 } 391 392 /** 393 * @return the <code>StringVector</code> of all outgoing urls 394 */ 395 public List urlsOutList() 396 { return this.urls_out; 397 } 398 399 /** 400 * Return an enumeration of outbound urls 401 */ 402 public Iterator urlsOut() 403 { if (this.urls_out == null) 404 { return null; 405 } 406 return this.urls_out.iterator(); 407 } 408 409 private static int urlListPos(List urlList, String url) 410 { for (int i = 0; i < urlList.size(); i ++) 411 { if (((String) urlList.get(i)).equals(url)) 412 { return i; 413 } 414 } 415 return -1; 416 } 417 418 // Rename internally the content of a list 419 private void __renameUrl(List urlList, String oldUrl, String newUrl) 420 { if (urlList != null) 421 { int oldPos = urlListPos(urlList, oldUrl); 422 423 if (oldPos >= 0) 424 { if (urlListPos(urlList, newUrl) >= 0) 425 { // delete oldUrl: now duplicate 426 urlList.remove(oldPos); 427 } 428 else 429 { // rename oldUrl 430 urlList.set(oldPos, newUrl); 431 } 432 } 433 } 434 } 435 436 /** 437 * Rename inbound/outbound url references 438 */ 439 public void renameUrl(String oldUrl, String newUrl) 440 { this.__renameUrl(this.urls_in, oldUrl, newUrl); 441 this.__renameUrl(this.urls_out, oldUrl, newUrl); 442 } 443 444 /** 445 * Save document 446 */ 447 public boolean save(String filename, boolean select) 448 { FileOutputStream fout; 449 PrintWriter pout; 450 451 String flattened = URLString.flattenedUrl(filename); 452 453 try 454 { HTMLDocAnchorList anchorList = new HTMLDocAnchorList(this); 455 456 anchorList.flattenLinks(); 457 458 fout = new FileOutputStream(flattened); 459 pout = new PrintWriter(fout); 460 if (select == true) 461 { pout.print(this.taggedSelectContent()); 462 } 463 else 464 { pout.print(this.content); 465 } 466 pout.close(); 467 fout.close(); 468 } 469 catch (IOException ex) 470 { System.out.println(ex); 471 return false; 472 } 473 return true; 474 } 475 476 /** 477 * Save document content 478 */ 479 public boolean save(String base) 480 { return this.save(base, false); 481 } 482 483 /** 484 * Save selected content 485 */ 486 public boolean saveSelect(String base) 487 { return this.save(base, true); 488 } 489 490 /** 491 * Get title of page 492 */ 493 public void getTitle() 494 { HTMLDocTitle doctitle; 495 496 if (this.content != null) 497 { doctitle = new HTMLDocTitle(this); 498 this.title = doctitle.Title(); 499 } 500 } 501 502 /** 503 * @return: title of page 504 */ 505 public String titleString() 506 { if (this.title == null) 507 { this.getTitle(); 508 } 509 return this.title; 510 } 511 512 /** 513 * @return a <code>String</code> of the url of the page 514 */ 515 public String urlString() 516 { return this.url.toString(); 517 } 518 519 /** 520 * Replace the HTML text of the page with the provided HTML string 521 * content. This is an extremely dangerous call to use, as dependent 522 * objects such as tag objects will be immediately rendered inaccurate 523 * in terms of pointers held to the original text. 524 */ 525 protected void setContent(String newContent) 526 { this.content = newContent; 527 } 528 529 /** 530 * @return a <code>String</code> of the HTML text of the page 531 */ 532 public String getContent() 533 { return this.content; 534 } 535 536 /** 537 * @return a <code>HTMLBlock</code> of the page content 538 */ 539 public HTMLBlock getCodedContent() 540 { // if we haven't already got it, code the bugger up right now! 541 if (this.coded == null) 542 { HTMLParser parser; 543 String next; 544 HTMLBlock coded; 545 HTMLTag tag; 546 HTMLCText text; 547 548 if (this.content == null) 549 { return null; 550 } 551 552 parser = new HTMLParser(this); 553 coded = new HTMLBlock(0, this.content.length()); 554 555 parser.startParse(); 556 next = parser.fullParse(); 557 while (next != null) 558 { if (HTMLTag.isTag(next)) 559 { tag = new HTMLTag(next, parser.lastParse(), parser.atParse()); 560 coded.addTag(tag); 561 } 562 else 563 { text = new HTMLCText(next, parser.lastParse(), parser.atParse()); 564 coded.addText(text); 565 } 566 next = parser.fullParse(); 567 } 568 this.coded = coded; 569 } 570 return this.coded; 571 } 572 573 /** 574 * @return <code>true</code> if this document is not errored from loading 575 * etc. 576 */ 577 public boolean valid() 578 { return (this.status < 4 && this.content != null && this.content.equals("null") == false); 579 } 580 581 /** 582 * HTMLString: returns content 583 */ 584 public String HTMLString() 585 { return this.content; 586 } 587 588 /** 589 * Return text of document as ascii text; 590 */ 591 /* 592 public String toString() 593 { StringBuffer reply = new StringBuffer(); 594 int parastart = 0; 595 596 return reply.toString(); 597 } 598 */ 599 600 /** 601 * Set current note on this page 602 * @return old note object 603 */ 604 public Object setNote(Object note) 605 { Object oldnote; 606 607 oldnote = this.note; 608 this.note = note; 609 return oldnote; 610 } 611 612 /** 613 * @return current note <code>Object</code> on this page 614 */ 615 public Object getNote() 616 { return this.note; 617 } 602 next = parser.fullParse(); 603 } 604 this.coded = coded; 605 } 606 return this.coded; 607 } 608 609 /** 610 * @return <code>true</code> if this document is not errored from loading 611 * etc. 612 */ 613 public boolean valid() 614 { 615 return (this.status < 4 && this.content != null && this.content.equals("null") == false); 616 } 617 618 /** 619 * HTMLString: returns content 620 */ 621 public String HTMLString() 622 { 623 return this.content; 624 } 625 626 /** 627 * Return text of document as ascii text; 628 */ 629 /* 630 public String toString() 631 { StringBuffer reply = new StringBuffer(); 632 int parastart = 0; 633 634 return reply.toString(); 635 } 636 */ 637 638 /** 639 * Set current note on this page 640 * @return old note object 641 */ 642 public Object setNote(Object note) 643 { 644 Object oldnote; 645 646 oldnote = this.note; 647 this.note = note; 648 return oldnote; 649 } 650 651 /** 652 * @return current note <code>Object</code> on this page 653 */ 654 public Object getNote() 655 { 656 return this.note; 657 } 618 658 }
Note:
See TracChangeset
for help on using the changeset viewer.