1 | package org.nzdl.gsdl.GsdlCollageApplet;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import java.net.*;
|
---|
5 | import java.util.*;
|
---|
6 |
|
---|
/**
 * Examines html pages and extracts all the images and links.
 *
 * The page is opened when the object is constructed and is consumed through
 * {@link #read()} (or {@link #readAll()}). While a page that looks like html
 * is being read, every tag is run through a small finite-state machine that
 * collects the targets of <code>href</code>, <code>src</code> and
 * <code>background</code> attributes, plus the <code>href</code> of
 * <code>link</code> tags. Each byte of the stream is treated as one
 * character; no charset decoding is done.
 */
public class CURL {

    // States of the tag scanner used by findURL(). For each attribute family
    // the states spell out the attribute name, then *_EQUAL (seen '='),
    // *_Q (inside a quoted value), *_NQ (inside an unquoted value) and
    // *_FINAL (value complete).
    private final static int GROUND = 0;
    private final static int COMMENT = 5;
    private final static int COMMENT_DASH = 6;
    private final static int COMMENT_FINAL = 7;
    private final static int H = 11;
    private final static int HR = 12;
    private final static int HRE = 13;
    private final static int HREF = 14;
    private final static int HREF_EQUAL = 15;
    private final static int HREF_Q = 16;
    private final static int HREF_NQ = 17;
    private final static int HREF_FINAL = 18;
    private final static int S = 23;
    private final static int SR = 24;
    private final static int SRC = 25;
    private final static int SRC_EQUAL = 26;
    private final static int SRC_Q = 27;
    private final static int SRC_NQ = 28;
    private final static int SRC_FINAL = 29;
    private final static int L = 67;
    private final static int LI = 68;
    private final static int LIN = 69;
    private final static int LINK = 70;
    private final static int LINK_H = 73;
    private final static int LINK_HR = 74;
    private final static int LINK_HRE = 75;
    private final static int LINK_HREF = 76;
    private final static int LINK_EQUAL = 77;
    private final static int LINK_Q = 78;
    private final static int LINK_NQ = 79;
    private final static int LINK_FINAL = 80;
    private final static int B = 85;
    private final static int BA = 86;
    private final static int BAC = 87;
    private final static int BACK = 88;
    private final static int BACKG = 89;
    private final static int BACKGR = 90;
    private final static int BACKGRO = 91;
    private final static int BACKGROU = 92;
    private final static int BACKGROUN = 93;
    private final static int BACKGROUND = 94;
    private final static int BACKGROUND_EQUAL = 95;
    private final static int BACKGROUND_Q = 96;
    private final static int BACKGROUND_NQ = 97;
    private final static int BACKGROUND_FINAL = 98;

    /** False once the url could not be parsed or opened. */
    private boolean url_valid = true;
    /** Stream the page is read from; null when not connected or already closed. */
    private InputStream input = null;
    /** One value of look-ahead on the raw stream, or -1 when nothing is pending. */
    private int peek_value = -1;
    /** Processed text (one tag or one run of content) waiting to be handed out. */
    private final StringBuffer buffer = new StringBuffer();
    /** Index of the next unread character in {@link #buffer}. */
    private int buffer_pos = 0;
    /** The page being processed; null when the url string was malformed. */
    private URL url = null;
    /** Result of the content-type guess for {@link #url}, computed once. */
    private boolean is_html = false;
    private Vector href_links = null;
    private Vector src_links = null;
    private Vector link_links = null;
    private Vector background_links = null;

    /** Starts processing the given url for images and links
     * @param url_str The url to examine */
    public CURL(String url_str) {
        href_links = new Vector();
        src_links = new Vector();
        link_links = new Vector();
        background_links = new Vector();

        try {
            url = new URL(url_str);
            input = new BufferedInputStream(url.openStream());
        }
        catch (MalformedURLException e) {
            url_valid = false;
        }
        catch (IOException e) {
            url_valid = false;
        }

        // The url never changes afterwards, so the guess is made only once
        // instead of once per character read.
        is_html = url != null && guessContentType(url.toString()).startsWith("text/html");
    }

    /** Checks that a valid connection to the url has been made */
    public boolean connected_ok() {
        return url_valid;
    }

    /** Gets any href links from this url
     * @return Vector of href links (the internal vector, not a copy) */
    public Vector getHrefLinks() {
        return href_links;
    }

    /** Gets any source links from this url
     * @return Vector of source links (the internal vector, not a copy) */
    public Vector getSrcLinks() {
        return src_links;
    }

    /** Gets any other links from this url
     * @return Vector of other links (the internal vector, not a copy) */
    public Vector getLinkLinks() {
        return link_links;
    }

    /** Gets any background links from this url
     * @return Vector of background links (the internal vector, not a copy) */
    public Vector getBackgroundLinks() {
        return background_links;
    }

    /** Gets the url currently being processed
     * @return The url, or null if the string given to the constructor was malformed */
    public URL getURL() {
        return url;
    }

    /** Checks that the content of the url is in html. The check is a guess
     * based on the text of the url only.
     * @return true if the url looks like an html page, false otherwise
     *         (including when the url was malformed) */
    public boolean isHTML() {
        return is_html;
    }

    /** Reads a value from the buffer
     * @return Value read if successful and -1 if not */
    public int read() {
        if (!is_html) {
            return getRaw();
        }
        if (buffer_pos >= buffer.length()) {
            // Everything handed out; recycle the buffer and fetch the next
            // tag or content block.
            buffer.setLength(0);
            buffer_pos = 0;
            refill();
        }
        if (buffer_pos < buffer.length()) {
            return buffer.charAt(buffer_pos++);
        }
        return -1;
    }

    /** Reads the entire URL */
    public void readAll() {
        while (read() != -1) {
            // Reading is done only for its side effect of collecting links.
        }
    }

    /** Releases the underlying stream. Called automatically once the end of
     * the stream is reached; safe to call more than once. */
    public void close() {
        if (input != null) {
            try {
                input.close();
            }
            catch (IOException ignored) {
                // Nothing useful can be done about a failed close.
            }
            input = null;
        }
    }

    // Gets the head of the raw stream, honouring a pending peeked value.
    // Returns -1 at end of stream, on a read error, or when not connected.
    private int getRaw() {
        if (peek_value != -1) {
            int pending = peek_value;
            peek_value = -1;
            return pending;
        }
        if (input == null) {
            return -1;
        }
        try {
            int value = input.read();
            if (value == -1) {
                close();
            }
            return value;
        }
        catch (IOException e) {
            System.err.println("Error reading " + url + ": " + e);
            close();
            return -1;
        }
    }

    // Looks at the next raw value without consuming it.
    private int peekRaw() {
        if (peek_value == -1) {
            peek_value = getRaw();
        }
        return peek_value;
    }

    // Refills the buffered buffer with the next tag or non-tag block
    // The tag is checked for urls. Note a tag is taken to be < .. > or
    // < .. < so comments are supported, but comment blocks are still
    // scanned.
    private void refill() {
        int value = getRaw();
        if (value == -1) {
            return;
        }

        if (value == '<') {
            buffer.append('<');
            StringBuffer tag = new StringBuffer();
            value = getRaw();
            while (value != -1 && peekRaw() != '<' && value != '>') {
                tag.append((char) value);
                value = getRaw();
            }
            if (value != -1 && value != '>') {
                // The tag was cut short by the next '<'; the character in
                // hand still belongs to it and must be scanned with it.
                tag.append((char) value);
                value = -1;
            }
            buffer.append(findURL(tag.toString()));
            if (value != -1) {
                // Closing '>' (nothing is added when the stream or the tag
                // ended without one).
                buffer.append((char) value);
            }
        } else {
            while (value != -1 && value != '<') {
                buffer.append((char) value);
                value = getRaw();
            }
            // The '<' that ended the content belongs to the next tag, so it
            // is pushed back to be returned by the next raw read.
            if (value == '<') {
                peek_value = value;
            }
        }
    }

    // Run the finite-state machine on the inside of one tag (without the
    // surrounding '<' and '>'). At most one url is extracted per tag and
    // stored in the matching vector, resolved against the page url. The tag
    // text is returned unchanged so that read() reproduces the page.
    private String findURL(String tail) {
        int state = GROUND;
        char quote = '\0';
        StringBuffer head = new StringBuffer();
        StringBuffer url_str = new StringBuffer();
        int pos = 0;

        // Sift through the tag for urls
        while (pos < tail.length() && !isFinal(state)) {
            char ch = tail.charAt(pos++);
            switch (state) {
            case HREF_EQUAL:
            case SRC_EQUAL:
            case LINK_EQUAL:
            case BACKGROUND_EQUAL:
                if (isBlank(ch)) {
                    head.append(ch); // Skip blanks
                } else if (ch == '\'' || ch == '"') {
                    // Remember which quote opened the value so that the
                    // other kind of quote may appear inside it.
                    quote = ch;
                    state = valueStateFor(state, true);
                    head.append(ch);
                } else {
                    state = valueStateFor(state, false);
                    url_str.append(ch);
                }
                break;

            case HREF_NQ:
            case SRC_NQ:
            case LINK_NQ:
            case BACKGROUND_NQ:
                if (isBlank(ch)) {
                    state = finalStateFor(state);
                    pos--; // the blank stays in the remainder of the tag
                } else {
                    url_str.append(ch);
                }
                break;

            case HREF_Q:
            case SRC_Q:
            case LINK_Q:
            case BACKGROUND_Q:
                if (ch == quote) {
                    state = finalStateFor(state);
                    pos--; // the closing quote stays in the remainder of the tag
                } else {
                    url_str.append(ch);
                }
                break;

            default:
                // Still looking for, or in the middle of, an attribute name.
                state = nextScanState(state, Character.toLowerCase(ch));
                head.append(ch);
                break;
            }
        }

        // An unquoted value that runs up to the end of the tag is complete
        // too: the closing '>' is never part of the text scanned here.
        state = finalStateFor(state, false);

        Vector links = linksFor(state);
        if (links != null) {
            try {
                links.addElement(new URL(url, url_str.toString()));
            }
            catch (MalformedURLException e) {
                System.err.println("Ignoring malformed link '" + url_str + "' in " + url + ": " + e.getMessage());
            }
        }

        return head.append(url_str).append(tail.substring(pos)).toString();
    }

    // True for the states in which scanning of the tag stops.
    private static boolean isFinal(int state) {
        return state == BACKGROUND_FINAL || state == COMMENT_FINAL
            || state == HREF_FINAL || state == LINK_FINAL || state == SRC_FINAL;
    }

    // True for the characters treated as blanks inside a tag.
    private static boolean isBlank(char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    // Returns on_match if actual is the expected character, on_miss otherwise.
    private static int step(char actual, char expected, int on_match, int on_miss) {
        return (actual == expected) ? on_match : on_miss;
    }

    // Transition for a state that follows a complete attribute name: blanks
    // keep the state, '=' moves on to the value, anything else gives up.
    private static int afterName(char ch, int current, int on_equal, int on_miss) {
        if (isBlank(ch)) {
            return current;
        }
        return step(ch, '=', on_equal, on_miss);
    }

    // Transition table for all states that are matching an attribute name
    // (or the start of a comment). lch is the current character in lower case.
    private static int nextScanState(int state, char lch) {
        switch (state) {
        // Initial state.
        case GROUND:
            switch (lch) {
            case '!': return COMMENT;
            case 'b': return B;
            case 'h': return H;
            case 'l': return L;
            case 's': return S;
            default: return GROUND;
            }
        // A possible comment
        case COMMENT: return step(lch, '-', COMMENT_DASH, GROUND);
        case COMMENT_DASH: return step(lch, '-', COMMENT_FINAL, GROUND); // skip comments
        // A possible href
        case H: return step(lch, 'r', HR, GROUND);
        case HR: return step(lch, 'e', HRE, GROUND);
        case HRE: return step(lch, 'f', HREF, GROUND);
        case HREF: return afterName(lch, HREF, HREF_EQUAL, GROUND);
        // A possible src
        case S: return step(lch, 'r', SR, GROUND);
        case SR: return step(lch, 'c', SRC, GROUND);
        case SRC: return afterName(lch, SRC, SRC_EQUAL, GROUND);
        // A possible link-href combo. Once "link" has been seen the scanner
        // stays in the LINK family until the href is found.
        case L: return step(lch, 'i', LI, GROUND);
        case LI: return step(lch, 'n', LIN, GROUND);
        case LIN: return step(lch, 'k', LINK, GROUND);
        case LINK: return step(lch, 'h', LINK_H, LINK);
        case LINK_H: return step(lch, 'r', LINK_HR, LINK);
        case LINK_HR: return step(lch, 'e', LINK_HRE, LINK);
        case LINK_HRE: return step(lch, 'f', LINK_HREF, LINK);
        case LINK_HREF: return afterName(lch, LINK_HREF, LINK_EQUAL, LINK);
        // A possible background
        case B: return step(lch, 'a', BA, GROUND);
        case BA: return step(lch, 'c', BAC, GROUND);
        case BAC: return step(lch, 'k', BACK, GROUND);
        case BACK: return step(lch, 'g', BACKG, GROUND);
        case BACKG: return step(lch, 'r', BACKGR, GROUND);
        case BACKGR: return step(lch, 'o', BACKGRO, GROUND);
        case BACKGRO: return step(lch, 'u', BACKGROU, GROUND);
        case BACKGROU: return step(lch, 'n', BACKGROUN, GROUND);
        case BACKGROUN: return step(lch, 'd', BACKGROUND, GROUND);
        case BACKGROUND: return afterName(lch, BACKGROUND, BACKGROUND_EQUAL, GROUND);
        default: return state;
        }
    }

    // Maps an *_EQUAL state to the state for a quoted or unquoted value.
    private static int valueStateFor(int equal_state, boolean quoted) {
        switch (equal_state) {
        case HREF_EQUAL: return quoted ? HREF_Q : HREF_NQ;
        case SRC_EQUAL: return quoted ? SRC_Q : SRC_NQ;
        case LINK_EQUAL: return quoted ? LINK_Q : LINK_NQ;
        case BACKGROUND_EQUAL: return quoted ? BACKGROUND_Q : BACKGROUND_NQ;
        default: return equal_state;
        }
    }

    // Maps a value state (*_Q or *_NQ) to the *_FINAL state of its family.
    private static int finalStateFor(int value_state) {
        return finalStateFor(value_state, true);
    }

    // As above; when include_quoted is false only *_NQ states are mapped and
    // every other state is returned unchanged.
    private static int finalStateFor(int value_state, boolean include_quoted) {
        switch (value_state) {
        case HREF_NQ: return HREF_FINAL;
        case SRC_NQ: return SRC_FINAL;
        case LINK_NQ: return LINK_FINAL;
        case BACKGROUND_NQ: return BACKGROUND_FINAL;
        case HREF_Q: return include_quoted ? HREF_FINAL : value_state;
        case SRC_Q: return include_quoted ? SRC_FINAL : value_state;
        case LINK_Q: return include_quoted ? LINK_FINAL : value_state;
        case BACKGROUND_Q: return include_quoted ? BACKGROUND_FINAL : value_state;
        default: return value_state;
        }
    }

    // Gets the vector that collects urls for the given final state, or null
    // if the state does not carry a url.
    private Vector linksFor(int state) {
        switch (state) {
        case HREF_FINAL: return href_links;
        case SRC_FINAL: return src_links;
        case LINK_FINAL: return link_links;
        case BACKGROUND_FINAL: return background_links;
        default: return null;
        }
    }

    // Guesses a content type from the text of a url: directories, .html and
    // .htm files and anything with a query string count as html, everything
    // else is taken to be an image.
    static private String guessContentType(String text) {
        if (text.endsWith("/")) {
            return "text/html";
        } else if (text.endsWith(".html")) {
            return "text/html";
        } else if (text.endsWith(".htm")) {
            return "text/html";
        } else if (text.indexOf("?") > 0) {
            return "text/html";
        }
        return "image/jpeg";
    }

}
|
---|