1 | package org.nzdl.gsdl.GsdlCollageApplet;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import java.net.*;
|
---|
5 | import java.util.*;
|
---|
6 |
|
---|
/**
 * Examines html pages and extracts all the images and links.
 *
 * <p>The url is opened by the constructor and consumed lazily through {@link #read()}, or all
 * at once through {@link #readAll()}. While an html page is being read, the text of every tag
 * is scanned for <code>href</code>, <code>src</code> and <code>background</code> attributes and
 * for <code>href</code> attributes that follow the word <code>link</code>. Each value found is
 * resolved against the page url and appended to the matching vector.
 *
 * <p>At most one url is extracted per tag, and the scan is purely textual: the keywords are
 * recognised wherever they occur in the tag text, not only as attribute names. Bytes from the
 * stream are widened to characters one-to-one; no character set decoding is done.
 */
public class CURL {

    // States of the tag scanner in findURL.
    /** Looking for the first letter of a keyword. */
    private final static int GROUND = 0;
    /** Seen "!". */
    private final static int COMMENT = 1;
    /** Seen "!-". */
    private final static int COMMENT_DASH = 2;
    /** Seen "!--": the rest of the tag is skipped. */
    private final static int COMMENT_FINAL = 3;
    /** Part-way through matching a keyword. */
    private final static int KEYWORD = 4;
    /** Seen "link": only an href counts from here on. */
    private final static int LINK = 5;
    /** Keyword complete, expecting '='. */
    private final static int ATTR = 6;
    /** Seen '=', expecting the start of the value. */
    private final static int EQUAL = 7;
    /** Inside a quoted value. */
    private final static int VALUE_Q = 8;
    /** Inside an unquoted value. */
    private final static int VALUE_NQ = 9;
    /** A complete value has been collected. */
    private final static int FINAL = 10;

    private boolean url_valid = true;
    private InputStream input = null;
    /** One character of push-back for the raw stream, or -1 when empty. */
    private int peek_value = -1;
    /** The tag or text block currently being handed out by read(). */
    private StringBuffer buffer = new StringBuffer();
    /** Index of the next unread character in buffer. */
    private int buffer_pos = 0;
    private URL url = null;
    private Vector href_links = null;
    private Vector src_links = null;
    private Vector link_links = null;
    private Vector background_links = null;

    /** Starts processing the given url for images and links
     * @param url_str The url to examine */
    public CURL(String url_str) {
        href_links = new Vector();
        src_links = new Vector();
        link_links = new Vector();
        background_links = new Vector();

        try {
            url = new URL(url_str);
            input = url.openStream();
        }
        catch (IOException e) {
            // Covers MalformedURLException as well (it is an IOException).
            url_valid = false;
        }
    }

    /** Checks that a valid connection to the url has been made */
    public boolean connected_ok() {
        return url_valid;
    }

    /** Gets any href links from this url
     * @return Vector of href links */
    public Vector getHrefLinks() {
        return href_links;
    }

    /** Gets any source links from this url
     * @return Vector of source links */
    public Vector getSrcLinks() {
        return src_links;
    }

    /** Gets any other links from this url
     * @return Vector of other links */
    public Vector getLinkLinks() {
        return link_links;
    }

    /** Gets any background links from this url
     * @return Vector of background links */
    public Vector getBackgroundLinks() {
        return background_links;
    }

    /** Gets the url currently being processed
     * @return The url, or null if the string given to the constructor was malformed */
    public URL getURL() {
        return url;
    }

    /** Checks that the content of the url is in html. The check is a guess based on the text
     * of the url only; nothing is read from the connection.
     * @return true if the url looks like an html page, false otherwise or if there is no url */
    public boolean isHTML() {
        if (url == null) {
            return false;
        }
        return guessContentType(url.toString()).startsWith("text/html");
    }

    /** Reads a value from the buffer. Html pages are passed through one tag or text block at a
     * time so that tags can be scanned for links; anything else is read straight from the stream.
     * @return Value read if successful and -1 if not */
    public int read() {
        if (!isHTML()) {
            return getRaw();
        }
        if (buffer_pos >= buffer.length()) {
            refill();
        }
        if (buffer_pos < buffer.length()) {
            return buffer.charAt(buffer_pos++);
        }
        return -1;
    }

    /** Reads the entire URL */
    public void readAll() {
        while (read() != -1) {
            // The content is discarded; the links are collected as a side effect of read().
        }
    }

    // Gets the next value from the raw stream, honouring a pushed-back value.
    // Returns -1 at end of stream, on a read error, or if the stream never opened.
    private int getRaw() {
        if (peek_value != -1) {
            int pushed_back = peek_value;
            peek_value = -1;
            return pushed_back;
        }
        if (input == null) {
            return -1;
        }
        int value = -1;
        try {
            value = input.read();
        } catch (IOException e) {
            // Best effort: a failed read is treated as the end of the page.
            System.err.println("Error reading " + url + ": " + e);
        }
        if (value == -1) {
            closeInput();
        }
        return value;
    }

    // Looks at the next raw value without consuming it.
    private int peekRaw() {
        if (peek_value == -1) {
            peek_value = getRaw();
        }
        return peek_value;
    }

    // Releases the connection once nothing more can be read from it.
    private void closeInput() {
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing useful can be done about a failed close.
            }
            input = null;
        }
    }

    // Refills the buffered buffer with the next tag or non-tag block.
    // The tag is checked for urls. Note a tag is taken to be < .. > or
    // < .. < so comments are supported, but comment blocks are still
    // scanned.
    private void refill() {
        buffer.setLength(0);
        buffer_pos = 0;

        int value = getRaw();
        if (value == -1) {
            return;
        }

        if (value == '<') {
            StringBuffer tag = new StringBuffer();
            boolean closed = false;
            value = getRaw();
            while (value != -1) {
                if (value == '>') {
                    closed = true;
                    break;
                }
                tag.append((char) value);
                // A '<' ends the tag as well; it is left in the stream to start the next one.
                if (peekRaw() == '<') {
                    break;
                }
                value = getRaw();
            }
            String tag_text = tag.toString();
            findURL(tag_text);
            // The tag itself is passed through to the reader unchanged.
            buffer.append('<').append(tag_text);
            if (closed) {
                buffer.append('>');
            }
        } else {
            while (value != -1 && value != '<') {
                buffer.append((char) value);
                value = getRaw();
            }
            // If we've read the '<' that starts the next tag, push it back so that the
            // next refill sees it.
            if (value == '<') {
                peek_value = value;
            }
        }
    }

    // True for the characters that may surround '=' and that end an unquoted value.
    private static boolean isBlank(char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    // Runs the finite-state machine over the text of one tag (without its < and >) and
    // records the first url found, if any, in the matching vector.
    private void findURL(String tag) {
        int state = GROUND;
        String word = "";          // keyword being matched while in KEYWORD
        int matched = 0;           // number of letters of word matched so far
        int on_match = GROUND;     // state entered when word is complete
        int on_mismatch = GROUND;  // state entered when word fails to match
        Vector target = null;      // vector that receives the url
        char quote = 0;            // quote character that opened the value
        StringBuffer url_str = new StringBuffer();
        int pos = 0;

        while (pos < tag.length() && state != FINAL && state != COMMENT_FINAL) {
            char ch = tag.charAt(pos++);
            // Character.toLowerCase does not depend on the default locale.
            char lch = Character.toLowerCase(ch);
            switch (state) {
            case GROUND:
                switch (lch) {
                case '!': state = COMMENT; break;
                case 'b': state = KEYWORD; word = "background"; target = background_links; break;
                case 'h': state = KEYWORD; word = "href"; target = href_links; break;
                case 'l': state = KEYWORD; word = "link"; target = link_links; break;
                case 's': state = KEYWORD; word = "src"; target = src_links; break;
                }
                if (state == KEYWORD) {
                    matched = 1;
                    on_match = (lch == 'l') ? LINK : ATTR;
                    on_mismatch = GROUND;
                }
                break;
            // A possible comment
            case COMMENT:
                state = (lch == '-') ? COMMENT_DASH : GROUND;
                break;
            case COMMENT_DASH:
                state = (lch == '-') ? COMMENT_FINAL : GROUND; // skip comments
                break;
            // Part-way through background, href, link or src
            case KEYWORD:
                if (lch == word.charAt(matched)) {
                    matched++;
                    if (matched == word.length()) {
                        state = on_match;
                    }
                } else {
                    // The mismatching character is consumed, not rescanned.
                    state = on_mismatch;
                }
                break;
            // After "link" only an href is of interest
            case LINK:
                if (lch == 'h') {
                    state = KEYWORD;
                    word = "href";
                    matched = 1;
                    on_match = ATTR;
                    on_mismatch = LINK;
                    target = link_links;
                }
                break;
            case ATTR:
                if (lch == '=') {
                    state = EQUAL;
                } else if (!isBlank(lch)) {
                    state = GROUND;
                }
                break;
            case EQUAL:
                if (lch == '\'' || lch == '\"') {
                    quote = ch;
                    state = VALUE_Q;
                } else if (!isBlank(lch)) {
                    state = VALUE_NQ;
                    url_str.append(ch);
                }
                break;
            case VALUE_NQ:
                if (isBlank(lch)) {
                    state = FINAL;
                } else {
                    url_str.append(ch);
                }
                break;
            case VALUE_Q:
                // Only the quote that opened the value closes it, so a value such as
                // "it's.html" is kept whole.
                if (ch == quote) {
                    state = FINAL;
                } else {
                    url_str.append(ch);
                }
                break;
            }
        }

        // An unquoted value that runs to the end of the tag is complete too.
        if (state == VALUE_NQ) {
            state = FINAL;
        }

        if (state == FINAL) {
            // Attribute values are html-escaped; undo the escaping of '&'.
            String link = url_str.toString().replaceAll("&amp;", "&");
            try {
                target.addElement(new URL(url, link));
            }
            catch (MalformedURLException e) {
                // Pages routinely contain links java.net.URL cannot represent; skip them.
                System.err.println("Ignoring link '" + link + "': " + e.getMessage());
            }
        }
    }

    // Guesses the content type from the text of a url: directories, .html/.htm files and
    // anything with a query string are taken to be html, everything else an image.
    static private String guessContentType(String text) {
        if (text.endsWith("/")
            || text.endsWith(".html")
            || text.endsWith(".htm")
            || text.indexOf("?") > 0) {
            return "text/html";
        }
        return "image/jpeg";
    }

}