[6816] | 1 | package org.nzdl.gsdl.GsdlCollageApplet;
|
---|
| 2 |
|
---|
| 3 | import java.io.*;
|
---|
| 4 | import java.net.*;
|
---|
| 5 | import java.util.*;
|
---|
| 6 |
|
---|
/**
 * Examines html pages and extracts all the images and links.
 *
 * <p>The page is opened in the constructor and consumed one value at a time
 * through {@link #read()} (or in one go with {@link #readAll()}). While an
 * HTML page is being read, every tag is scanned for <code>href</code>,
 * <code>src</code>, <code>background</code> and <code>&lt;link ... href&gt;</code>
 * attributes; the values found are resolved against the page URL and
 * collected in the vectors returned by the <code>get...Links()</code>
 * methods. The links are therefore only complete once the page has been
 * read to the end.</p>
 *
 * <p>Each byte of the stream is passed through as one character; no
 * charset decoding is performed.</p>
 */
public class CURL {

    // Attribute kinds recognised by the tag scanner. Each non-negative
    // value is an index into KEYWORDS.
    private final static int ATTR_NONE = -1;
    private final static int ATTR_HREF = 0;
    private final static int ATTR_SRC = 1;
    private final static int ATTR_LINK = 2;
    private final static int ATTR_BACKGROUND = 3;
    private final static String[] KEYWORDS = { "href", "src", "link", "background" };

    // States of the tag scanner (see findURL).
    private final static int GROUND = 0;         // looking for the start of a keyword
    private final static int COMMENT = 1;        // seen "!"
    private final static int COMMENT_DASH = 2;   // seen "!-"
    private final static int COMMENT_FINAL = 3;  // seen "!--": stop scanning this tag
    private final static int KEYWORD = 4;        // part-way through a keyword
    private final static int LINK_SCAN = 5;      // seen "link", looking for an 'h'
    private final static int LINK_KEYWORD = 6;   // seen "link", part-way through "href"
    private final static int NAME = 7;           // attribute name complete, expecting '='
    private final static int EQUAL = 8;          // seen '=', expecting the value
    private final static int QUOTED = 9;         // inside a quoted value
    private final static int UNQUOTED = 10;      // inside an unquoted value
    private final static int FINAL = 11;         // a complete value has been read

    private boolean url_valid = true;
    private boolean is_html = false;
    private InputStream input = null;
    // One raw value of look-ahead; -1 means "nothing peeked".
    private int peek_value = -1;
    // Processed characters waiting to be handed out by read(), and the
    // index of the next one to hand out.
    private StringBuffer buffer = new StringBuffer();
    private int buffer_pos = 0;
    private URL url = null;
    private Vector href_links = null;
    private Vector src_links = null;
    private Vector link_links = null;
    private Vector background_links = null;

    /** Starts processing the given url for images and links.
     * A malformed url or a failed connection is not reported by an
     * exception; check {@link #connected_ok()} instead.
     * @param url_str The url to examine */
    public CURL(String url_str) {
        href_links = new Vector();
        src_links = new Vector();
        link_links = new Vector();
        background_links = new Vector();

        try {
            url = new URL(url_str);
            input = url.openStream();
        }
        catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            url_valid = false;
        }
        // The url never changes, so the content type guess is made once
        // here rather than on every read(). A malformed url leaves url
        // null and is treated as "not html".
        is_html = (url != null) && guessContentType(url.toString()).startsWith("text/html");
    }

    /** Checks that a valid connection to the url has been made.
     * @return true if the url was well formed and its stream could be opened */
    public boolean connected_ok() {
        return url_valid;
    }

    /** Gets any href links from this url
     * @return Vector of href links (java.net.URL objects) found so far */
    public Vector getHrefLinks() {
        return href_links;
    }

    /** Gets any source links from this url
     * @return Vector of source links (java.net.URL objects) found so far */
    public Vector getSrcLinks() {
        return src_links;
    }

    /** Gets any other links from this url
     * @return Vector of &lt;link href&gt; links (java.net.URL objects) found so far */
    public Vector getLinkLinks() {
        return link_links;
    }

    /** Gets any background links from this url
     * @return Vector of background links (java.net.URL objects) found so far */
    public Vector getBackgroundLinks() {
        return background_links;
    }

    /** Gets the url currently being processed.
     * @return the url, or null if the string given to the constructor was malformed */
    public URL getURL() {
        return url;
    }

    /** Checks that the content of the url is in html. The decision is a
     * guess based on the text of the url only (see guessContentType).
     * @return true if the url looks like an html page, false otherwise,
     * including when the url was malformed */
    public boolean isHTML() {
        return is_html;
    }

    /** Reads a value from the buffer. For html pages the value comes from
     * the tag-scanning buffer, otherwise straight from the stream.
     * @return Value read if successful and -1 if not */
    public int read() {
        if (!isHTML()) {
            return getRaw();
        }
        if (buffer_pos >= buffer.length()) {
            refill();
        }
        if (buffer_pos < buffer.length()) {
            return getBuffer();
        }
        return -1;
    }

    /** Reads the entire URL, discarding the content. Useful for its side
     * effect of collecting every link on the page. */
    public void readAll() {
        while (read() != -1) {
            // keep going until the end of the stream
        }
    }

    /** Closes the underlying stream, if it is still open. This happens
     * automatically once the end of the stream is reached; call it
     * explicitly when abandoning a page part-way through. Safe to call
     * more than once. */
    public void close() {
        if (input != null) {
            try {
                input.close();
            }
            catch (IOException ignored) {
                // Nothing useful can be done about a failed close.
            }
            input = null;
        }
    }

    // Gets the head of the buffered buffer.
    private int getBuffer() {
        if (buffer_pos < buffer.length()) {
            int value = buffer.charAt(buffer_pos);
            buffer_pos++;
            if (buffer_pos == buffer.length()) {
                // Fully drained: reset so the buffer does not grow forever.
                buffer.setLength(0);
                buffer_pos = 0;
            }
            return value;
        }
        System.err.println("Called getBuffer on an empty buffer");
        return -1;
    }

    // Gets the head of the raw buffer: the peeked value if there is one,
    // otherwise the next value from the stream. Returns -1 at the end of
    // the stream, on a read error, or if the stream was never opened.
    private int getRaw() {
        if (peek_value != -1) {
            int peeked = peek_value;
            peek_value = -1;
            return peeked;
        }
        if (input == null) {
            return -1;
        }
        try {
            int value = input.read();
            if (value == -1) {
                close();
            }
            return value;
        }
        catch (IOException e) {
            e.printStackTrace();
            close();
            return -1;
        }
    }

    // Looks at the next raw value without consuming it.
    private int peekRaw() {
        if (peek_value == -1) {
            peek_value = getRaw();
        }
        return peek_value;
    }

    // Refills the buffered buffer with the next tag or non-tag block
    // The tag is checked for urls. Note a tag is taken to be < .. > or
    // < .. < so comments are supported, but comment blocks are still
    // scanned.
    private void refill() {
        int value = getRaw();
        if (value == -1) {
            return;
        }
        if (value == '<') {
            // Add opening < to buffer
            setBuffer(value);
            StringBuffer tag = new StringBuffer();
            value = getRaw();
            // Stop on the closing '>', at the end of the stream, or when the
            // next value opens a new tag. In the last case the current value
            // is not scanned as part of the tag but is still emitted below.
            while (value != -1 && value != '>' && peekRaw() != '<') {
                tag.append((char) value);
                value = getRaw();
            }
            buffer.append(findURL(tag.toString()));
            // Add closing > (or the last value before a new '<') to buffer,
            // but never the end-of-stream marker.
            if (value != -1) {
                setBuffer(value);
            }
        } else {
            while (value != -1 && value != '<') {
                setBuffer(value);
                value = getRaw();
            }
            // If we've accidentally read the '<' push it back in the stream by
            // setting peek_value to value. Since the peek_value will be returned
            // on the next read this has the desired effect.
            if (value == '<') {
                peek_value = value;
            }
        }
    }

    // Sets the tail of the buffered buffer.
    private void setBuffer(int value) {
        buffer.append((char) value);
    }

    // True for the characters the tag scanner treats as blanks.
    private static boolean isBlank(char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    // Maps the (lower-cased) first letter of a keyword to its attribute kind.
    private static int attributeStartingWith(char lch) {
        switch (lch) {
        case 'b': return ATTR_BACKGROUND;
        case 'h': return ATTR_HREF;
        case 'l': return ATTR_LINK;
        case 's': return ATTR_SRC;
        default:  return ATTR_NONE;
        }
    }

    // Picks the vector that collects links of the given attribute kind.
    private Vector linksFor(int attr) {
        switch (attr) {
        case ATTR_HREF:       return href_links;
        case ATTR_SRC:        return src_links;
        case ATTR_LINK:       return link_links;
        case ATTR_BACKGROUND: return background_links;
        default:              return null;
        }
    }

    // Run the finite-state machine on a buffer-load.
    //
    // 'tag' is the text of one tag without its enclosing < and >. The first
    // href/src/background attribute (or the href of a link element) found in
    // it is resolved against the page url and stored in the matching vector;
    // scanning stops there, or at the start of a comment ("!--"). Keywords
    // are matched case-insensitively and anywhere in the tag. After a failed
    // keyword match the failing character is not reconsidered as the start
    // of another keyword.
    //
    // Returns the tag text unchanged, so read() passes the page through as is.
    private String findURL(String tag) {
        int state = GROUND;
        int attr = ATTR_NONE;
        int matched = 0;     // number of keyword characters matched so far
        char quote = 0;      // the quote character that opened a QUOTED value
        int pos = 0;
        StringBuffer head = new StringBuffer();
        StringBuffer url_str = new StringBuffer();

        // Sift through the tag for urls
        while (pos < tag.length() && state != FINAL && state != COMMENT_FINAL) {
            char ch = tag.charAt(pos);
            pos++;
            // Character.toLowerCase is locale independent.
            char lch = Character.toLowerCase(ch);
            switch (state) {
            // Initial state.
            case GROUND:
                if (lch == '!') {
                    state = COMMENT;
                } else {
                    attr = attributeStartingWith(lch);
                    if (attr != ATTR_NONE) {
                        state = KEYWORD;
                        matched = 1;
                    }
                }
                head.append(ch);
                break;
            // A possible comment
            case COMMENT:
                state = (lch == '-') ? COMMENT_DASH : GROUND;
                head.append(ch);
                break;
            case COMMENT_DASH:
                state = (lch == '-') ? COMMENT_FINAL : GROUND; // skip comments
                head.append(ch);
                break;
            // A possible href, src, link or background
            case KEYWORD:
                if (lch == KEYWORDS[attr].charAt(matched)) {
                    matched++;
                    if (matched == KEYWORDS[attr].length()) {
                        // "link" is an element name: its href is still to come.
                        state = (attr == ATTR_LINK) ? LINK_SCAN : NAME;
                    }
                } else {
                    state = GROUND;
                }
                head.append(ch);
                break;
            // Inside a link element, looking for its href
            case LINK_SCAN:
                if (lch == 'h') {
                    state = LINK_KEYWORD;
                    matched = 1;
                }
                head.append(ch);
                break;
            case LINK_KEYWORD:
                if (lch == KEYWORDS[ATTR_HREF].charAt(matched)) {
                    matched++;
                    if (matched == KEYWORDS[ATTR_HREF].length()) {
                        state = NAME;
                    }
                } else {
                    state = LINK_SCAN;
                }
                head.append(ch);
                break;
            // Attribute name read: skip blanks up to the '='
            case NAME:
                if (lch == '=') {
                    state = EQUAL;
                } else if (!isBlank(lch)) {
                    state = GROUND;
                }
                head.append(ch);
                break;
            // '=' read: skip blanks up to the value
            case EQUAL:
                if (isBlank(lch)) {
                    head.append(ch);
                } else if (ch == '\'' || ch == '"') {
                    state = QUOTED;
                    quote = ch;
                    head.append(ch);
                } else {
                    state = UNQUOTED;
                    url_str.append(ch);
                }
                break;
            case UNQUOTED:
                if (isBlank(lch)) {
                    state = FINAL;
                    pos--; // the blank belongs to the rest of the tag
                } else {
                    url_str.append(ch);
                }
                break;
            case QUOTED:
                // Only the quote that opened the value closes it, so the
                // other kind of quote may appear inside the value.
                if (ch == quote) {
                    state = FINAL;
                    pos--; // the closing quote belongs to the rest of the tag
                } else {
                    url_str.append(ch);
                }
                break;
            }
        }

        // An unquoted value that runs up to the end of the tag, as in
        // <img src=pic.jpg>, is complete as well.
        if (state == UNQUOTED) {
            state = FINAL;
        }

        if (state == FINAL) {
            // Attribute values carry '&' as the entity "&amp;"; decode it for
            // the link itself but leave the passed-through text alone.
            String link = url_str.toString().replaceAll("&amp;", "&");
            try {
                linksFor(attr).addElement(new URL(url, link));
            }
            catch (MalformedURLException e) {
                // e.g. an unknown protocol: skip this link, keep reading.
                System.err.println("CURL: ignoring link '" + link + "': " + e.getMessage());
            }
        }

        return head.append(url_str).append(tag.substring(pos)).toString();
    }

    // Guesses the content type from the text of a url: anything ending in
    // "/", ".html" or ".htm", or containing a '?' after its first
    // character, is taken to be html; everything else is assumed to be
    // a jpeg image.
    static private String guessContentType(String text) {
        boolean looks_like_html = text.endsWith("/")
            || text.endsWith(".html")
            || text.endsWith(".htm")
            || text.indexOf("?") > 0;
        return looks_like_html ? "text/html" : "image/jpeg";
    }

}
|
---|