source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/CURL.java@ 11561

Last change on this file since 11561 was 11561, checked in by shaoqun, 18 years ago

changed back to use Vector collection

  • Property svn:keywords set to Author Date Id Revision
File size: 16.6 KB
Line 
1package org.nzdl.gsdl.GsdlCollageApplet;
2
3import java.io.*;
4import java.net.*;
5import java.util.*;
6
/**
 * Examines an html page and extracts all the images and links it refers to.
 *
 * <p>The page is consumed through {@link #read()} (or {@link #readAll()}).
 * While the page is read, every tag is scanned for the first <code>href</code>,
 * <code>src</code>, <code>background</code> or (inside a tag that mentions
 * <code>link</code>) <code>href</code> attribute; the value found is resolved
 * against the page url and stored in the matching vector.  The characters
 * returned by {@link #read()} are the page content, unchanged.
 *
 * <p>Bytes of the stream are treated as characters one-for-one; no charset
 * decoding is performed.  Instances are not thread-safe.
 */
public class CURL {

    // Attribute kinds recognised by the tag scanner.  Each value is also the
    // index of the attribute's keyword in KEYWORDS.
    private final static int HREF = 0;
    private final static int SRC = 1;
    private final static int LINK = 2;
    private final static int BACKGROUND = 3;
    private final static String[] KEYWORDS = { "href", "src", "link", "background" };

    // States of the tag scanner (see findURL).
    private final static int GROUND = 0;         // looking for the start of a keyword
    private final static int COMMENT = 1;        // seen "!"
    private final static int COMMENT_DASH = 2;   // seen "!-"
    private final static int COMMENT_FINAL = 3;  // seen "!--": stop scanning this tag
    private final static int KEYWORD = 4;        // part-way through KEYWORDS[kind]
    private final static int LINK_TAG = 5;       // seen "link", now looking for "href"
    private final static int NAME = 6;           // attribute name complete, expecting '='
    private final static int EQUAL = 7;          // seen '=', expecting the value
    private final static int UNQUOTED = 8;       // inside an unquoted value
    private final static int QUOTED = 9;         // inside a quoted value
    private final static int URL_FINAL = 10;     // a complete value has been read

    private boolean url_valid = true;
    private boolean is_html = false;
    private InputStream input = null;
    private int peek_value = -1;
    // The current tag or text block, and the index of the next character of
    // it to hand out.  Using an index avoids re-copying the block per character.
    private String buffer = "";
    private int buffer_pos = 0;
    private URL url = null;
    // Raw Vectors of java.net.URL; kept raw so existing callers are unaffected.
    private Vector href_links = null;
    private Vector src_links = null;
    private Vector link_links = null;
    private Vector background_links = null;

    /** Starts processing the given url for images and links.
     *  If the url is malformed or cannot be opened, {@link #connected_ok()}
     *  returns false and every read reports end of stream.
     *  @param url_str The url to examine */
    public CURL(String url_str) {
        href_links = new Vector();
        src_links = new Vector();
        link_links = new Vector();
        background_links = new Vector();

        try {
            url = new URL(url_str);
            // The url never changes, so its content type is worked out once
            // rather than on every character read.
            is_html = guessContentType(url.toString()).startsWith("text/html");
            // Buffered because the page is consumed one byte at a time.
            input = new BufferedInputStream(url.openStream());
        }
        catch (MalformedURLException e) {
            url_valid = false;
        }
        catch (IOException e) {
            url_valid = false;
        }
    }

    /** Checks that a valid connection to the url has been made */
    public boolean connected_ok() {
        return url_valid;
    }

    /** Gets any href links from this url
     *  @return Vector of href links (java.net.URL objects) */
    public Vector getHrefLinks() {
        return href_links;
    }

    /** Gets any source links from this url
     *  @return Vector of source links (java.net.URL objects) */
    public Vector getSrcLinks() {
        return src_links;
    }

    /** Gets any other links from this url
     *  @return Vector of other links (java.net.URL objects) */
    public Vector getLinkLinks() {
        return link_links;
    }

    /** Gets any background links from this url
     *  @return Vector of background links (java.net.URL objects) */
    public Vector getBackgroundLinks() {
        return background_links;
    }

    /** Gets the url currently being processed
     *  @return the url, or null if the string given to the constructor was malformed */
    public URL getURL() {
        return url;
    }

    /** Checks that the content of the url is in html.  The decision is a
     *  guess based on the text of the url only.
     *  @return true if the url looks like an html page; false otherwise,
     *          including when the url was malformed */
    public boolean isHTML() {
        return is_html;
    }

    /** Reads a value from the url.  Html pages are scanned for links as a
     *  side effect; anything else is passed through untouched.
     *  @return Value read if successful and -1 if not */
    public int read() {
        if (!isHTML()) {
            return getRaw();
        }
        if (buffer_pos >= buffer.length()) {
            refill();
        }
        if (buffer_pos < buffer.length()) {
            return getBuffer();
        }
        return -1;
    }

    /** Reads the entire URL */
    public void readAll() {
        while (read() != -1) {
            // Nothing to do: reading is what collects the links.
        }
    }

    /** Closes the connection to the url.  Called automatically once the end
     *  of the stream is reached; safe to call more than once. */
    public void close() {
        if (input != null) {
            try {
                input.close();
            }
            catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
            input = null;
        }
    }

    // Gets the head of the buffered buffer.
    private int getBuffer() {
        if (buffer_pos < buffer.length()) {
            return buffer.charAt(buffer_pos++);
        }
        System.err.println("Called getBuffer on an empty buffer");
        return -1;
    }

    // Gets the head of the raw stream, honouring a pushed-back character.
    private int getRaw() {
        if (peek_value != -1) {
            int pushed_back = peek_value;
            peek_value = -1;
            return pushed_back;
        }
        if (input == null) {
            // Never connected, or already at the end of the stream.
            return -1;
        }
        try {
            int value = input.read();
            if (value == -1) {
                close();
            }
            return value;
        }
        catch (IOException e) {
            System.err.println("CURL: error reading " + url + ": " + e.getMessage());
            close();
            return -1;
        }
    }

    // Looks at the head of the raw stream without consuming it.
    private int peekRaw() {
        if (peek_value == -1) {
            peek_value = getRaw();
        }
        return peek_value;
    }

    // Refills the buffered buffer with the next tag or non-tag block.
    // The tag is checked for urls. Note a tag is taken to be < .. > or
    // < .. < so comments are supported, but comment blocks are still
    // scanned.
    private void refill() {
        StringBuffer block = new StringBuffer();
        int value = getRaw();

        if (value == '<') {
            block.append('<');
            StringBuffer tag = new StringBuffer();
            value = getRaw();
            // Stop on '>', on end of stream, or when the next character
            // opens another tag.
            while (value != -1 && value != '>' && peekRaw() != '<') {
                tag.append((char) value);
                value = getRaw();
            }
            block.append(findURL(tag.toString()));
            // Add the character that ended the tag, unless the stream simply
            // ran out (appending -1 would inject a bogus \uffff).
            if (value != -1) {
                block.append((char) value);
            }
        }
        else {
            while (value != -1 && value != '<') {
                block.append((char) value);
                value = getRaw();
            }
            // The '<' belongs to the next block: push it back so that the
            // next read starts with it.
            if (value == '<') {
                peek_value = value;
            }
        }

        buffer = block.toString();
        buffer_pos = 0;
    }

    // Lowers the case of everything in the tag except text bound by "".
    private String smartLower(String tag) {
        StringBuffer lowered = new StringBuffer(tag.length());
        boolean lower = true;
        for (int i = 0; i < tag.length(); i++) {
            char ch = tag.charAt(i);
            if (ch == '"') {
                lower = !lower;
            }
            lowered.append(lower ? Character.toLowerCase(ch) : ch);
        }
        return lowered.toString();
    }

    private static boolean isBlank(char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    // Returns the kind whose keyword starts with the given (lower case)
    // character, or -1 if there is none.
    private static int keywordStartingWith(char lch) {
        for (int kind = 0; kind < KEYWORDS.length; kind++) {
            if (KEYWORDS[kind].charAt(0) == lch) {
                return kind;
            }
        }
        return -1;
    }

    // Runs the finite-state machine on the inside of one tag (the text
    // between '<' and '>').  The first href, src, background or link..href
    // value found is recorded; scanning stops there, or at the start of a
    // comment ("!--").  Returns the tag text unchanged so that it can be
    // handed on to the reader.
    private String findURL(String tag) {
        StringBuffer head = new StringBuffer();  // text before the value
        StringBuffer link = new StringBuffer();  // the value itself, as written
        int state = GROUND;
        int kind = -1;     // which attribute is being matched
        int matched = 0;   // how many characters of the keyword have matched
        char quote = 0;    // the character that opened a quoted value
        int pos = 0;

        while (pos < tag.length() && state != URL_FINAL && state != COMMENT_FINAL) {
            char ch = tag.charAt(pos++);
            // Character.toLowerCase is locale independent, unlike
            // String.toLowerCase (which maps 'I' to a dotless i in Turkish).
            char lch = Character.toLowerCase(ch);

            switch (state) {
            case GROUND:
                if (ch == '!') {
                    state = COMMENT;
                }
                else {
                    int candidate = keywordStartingWith(lch);
                    if (candidate >= 0) {
                        kind = candidate;
                        matched = 1;
                        state = KEYWORD;
                    }
                }
                head.append(ch);
                break;

            case COMMENT:
                state = (ch == '-') ? COMMENT_DASH : GROUND;
                head.append(ch);
                break;

            case COMMENT_DASH:
                state = (ch == '-') ? COMMENT_FINAL : GROUND;  // skip comments
                head.append(ch);
                break;

            case KEYWORD:
                if (lch != KEYWORDS[kind].charAt(matched)) {
                    state = GROUND;
                }
                else if (++matched == KEYWORDS[kind].length()) {
                    if (kind == LINK) {
                        // "link" is not itself an attribute: its url is in
                        // an href further along the tag.
                        state = LINK_TAG;
                        matched = 0;
                    }
                    else {
                        state = NAME;
                    }
                }
                head.append(ch);
                break;

            case LINK_TAG:
                if (lch != KEYWORDS[HREF].charAt(matched)) {
                    matched = 0;
                }
                else if (++matched == KEYWORDS[HREF].length()) {
                    state = NAME;
                }
                head.append(ch);
                break;

            case NAME:
                if (ch == '=') {
                    state = EQUAL;
                }
                else if (!isBlank(ch)) {
                    state = GROUND;
                }
                head.append(ch);
                break;

            case EQUAL:
                if (isBlank(ch)) {
                    head.append(ch);
                }
                else if (ch == '\'' || ch == '"') {
                    quote = ch;
                    state = QUOTED;
                    head.append(ch);
                }
                else {
                    state = UNQUOTED;
                    link.append(ch);
                }
                break;

            case UNQUOTED:
                if (isBlank(ch)) {
                    state = URL_FINAL;
                    pos--;  // the blank belongs to the rest of the tag
                }
                else {
                    link.append(ch);
                }
                break;

            case QUOTED:
                // Only the quote that opened the value closes it, so that
                // href='it"s.html' is not cut short.
                if (ch == quote) {
                    state = URL_FINAL;
                    pos--;  // the quote belongs to the rest of the tag
                }
                else {
                    link.append(ch);
                }
                break;
            }
        }

        // An unquoted value that runs to the end of the tag, as in
        // <img src=pic.gif>, is complete even though no blank follows it.
        if (state == UNQUOTED) {
            state = URL_FINAL;
        }
        if (state == URL_FINAL) {
            recordLink(kind, link.toString());
        }

        // Hand the tag back exactly as it was read.
        return head.append(link.toString()).append(tag.substring(pos)).toString();
    }

    // Resolves an attribute value against the page url and files it in the
    // vector for its kind.  Values that do not form a url are skipped.
    private void recordLink(int kind, String value) {
        String spec = value.replaceAll("&amp;", "&");
        try {
            URL new_url = new URL(url, spec);
            switch (kind) {
            case HREF:       href_links.addElement(new_url); break;
            case SRC:        src_links.addElement(new_url); break;
            case LINK:       link_links.addElement(new_url); break;
            case BACKGROUND: background_links.addElement(new_url); break;
            }
        }
        catch (MalformedURLException e) {
            System.err.println("CURL: ignoring link \"" + spec + "\": " + e.getMessage());
        }
    }

    // Guesses the content type from the text of a url: directories, .html
    // and .htm pages and anything with a query string count as html, and
    // everything else is assumed to be an image.
    static private String guessContentType(String text) {
        if (text.endsWith("/")
            || text.endsWith(".html")
            || text.endsWith(".htm")
            || text.indexOf("?") > 0) {
            return "text/html";
        }
        return "image/jpeg";
    }

}
Note: See TracBrowser for help on using the repository browser.