source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/CURL.java@ 6816

Last change on this file since 6816 was 6816, checked in by mdewsnip, 20 years ago

The GsdlCollageApplet: a classifier that displays a collage of the images in a collection. By Katrina Edgar (kde2).

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1package org.nzdl.gsdl.GsdlCollageApplet;
2
3import java.io.*;
4import java.net.*;
5import java.util.*;
6
/**
 * Examines html pages and extracts all the images and links.
 *
 * <p>The page is pulled through {@link #read()} one character at a time. For
 * URLs that look like html the content is split into tags and non-tag text;
 * every tag is scanned for the first <code>href</code>, <code>src</code> or
 * <code>background</code> attribute it contains, and the URL found there is
 * resolved against the page URL and stored in the matching list. Once the
 * word <code>link</code> has been seen inside a tag, an <code>href</code> in
 * that tag is stored in the "link" list instead of the "href" list.
 *
 * <p>Only the first URL of each tag is recorded. Bytes from the stream are
 * widened to chars one-to-one, i.e. no character-set decoding is performed.
 */
public class CURL {

    // States of the tag scanner used by findURL.
    private static final int GROUND = 0;          // nothing interesting seen yet
    private static final int COMMENT = 1;         // seen "!"
    private static final int COMMENT_DASH = 2;    // seen "!-"
    private static final int COMMENT_FINAL = 3;   // seen "!--": stop scanning
    private static final int KEYWORD = 4;         // part-way through a keyword
    private static final int LINK_TAG = 5;        // "link" seen, hunting for "href"
    private static final int ATTR_NAME = 6;       // keyword complete, expecting '='
    private static final int ATTR_EQUAL = 7;      // '=' seen, expecting the value
    private static final int VALUE_QUOTED = 8;    // inside a quoted value
    private static final int VALUE_UNQUOTED = 9;  // inside an unquoted value
    private static final int URL_FINAL = 10;      // a complete url was collected

    // Keywords the scanner recognises (always matched case-insensitively).
    private static final String KW_BACKGROUND = "background";
    private static final String KW_HREF = "href";
    private static final String KW_LINK = "link";
    private static final String KW_SRC = "src";

    private final boolean url_valid;
    private final boolean is_html;
    private final URL url;
    private InputStream input;
    // One character of push-back for the raw stream; -1 means "nothing held".
    private int peek_value = -1;
    // Processed characters waiting to be handed out by read().
    private final StringBuilder buffer = new StringBuilder();
    private int buffer_pos = 0;
    private final Vector href_links = new Vector();
    private final Vector src_links = new Vector();
    private final Vector link_links = new Vector();
    private final Vector background_links = new Vector();

    /** Starts processing the given url for images and links
     * @param url_str The url to examine */
    public CURL(String url_str) {
        URL parsed = null;
        InputStream stream = null;
        boolean ok = true;
        try {
            parsed = new URL(url_str);
            // Buffered because the page is consumed one byte at a time.
            stream = new BufferedInputStream(parsed.openStream());
        }
        catch (IOException e) {
            // Covers MalformedURLException (bad url) and failure to connect.
            ok = false;
        }
        url = parsed;
        input = stream;
        url_valid = ok;
        is_html = parsed != null
            && guessContentType(parsed.toString()).startsWith("text/html");
    }

    /** Checks that a valid connection to the url has been made
     * @return true if the url was well formed and its stream could be opened */
    public boolean connected_ok()
    {
        return url_valid;
    }

    /** Gets any href links from this url
     * @return Vector of href links (java.net.URL objects) */
    public Vector getHrefLinks() {
        return href_links;
    }

    /** Gets any source links from this url
     * @return Vector of source links (java.net.URL objects) */
    public Vector getSrcLinks() {
        return src_links;
    }

    /** Gets any other links from this url
     * @return Vector of other links (java.net.URL objects) */
    public Vector getLinkLinks() {
        return link_links;
    }

    /** Gets any background links from this url
     * @return Vector of background links (java.net.URL objects) */
    public Vector getBackgroundLinks() {
        return background_links;
    }

    /** Gets the url currently being processed
     * @return The url, or null if the string given to the constructor could
     * not be parsed */
    public URL getURL() {
        return url;
    }

    /** Checks that the content of the url is in html. The decision is a guess
     * based purely on the text of the url.
     * @return true if the url looks like html, false otherwise (including
     * when the url could not be parsed) */
    public boolean isHTML() {
        return is_html;
    }

    /** Reads a value from the buffer
     * @return Value read if successful and -1 if not */
    public int read() {
        if (!is_html) {
            return getRaw();
        }
        if (buffer_pos >= buffer.length()) {
            // Everything buffered has been handed out; fetch the next block.
            buffer.setLength(0);
            buffer_pos = 0;
            refill();
        }
        if (buffer_pos < buffer.length()) {
            return buffer.charAt(buffer_pos++);
        }
        return -1;
    }

    /** Reads the entire URL */
    public void readAll() {
        while (read() != -1) {
            // Reading is only done for its link-collecting side effect.
        }
    }

    /** Releases the connection to the url. This happens automatically when
     * the end of the stream is reached; call it when abandoning a url early. */
    public void close() {
        closeInput();
    }

    // Closes the underlying stream, if it is still open.
    private void closeInput() {
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
            input = null;
        }
    }

    // Gets the next value of the raw stream (honouring any pushed-back value),
    // or -1 at the end of the stream or when no stream is available.
    private int getRaw() {
        if (peek_value != -1) {
            int held = peek_value;
            peek_value = -1;
            return held;
        }
        if (input == null) {
            return -1;
        }
        int value = -1;
        try {
            value = input.read();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (value == -1) {
            // End of stream (or a read failure): release the connection.
            closeInput();
        }
        return value;
    }

    // Looks at the next value of the raw stream without consuming it.
    private int peekRaw() {
        if (peek_value == -1) {
            peek_value = getRaw();
        }
        return peek_value;
    }

    // Refills the buffered buffer with the next tag or non-tag block.
    // The tag is checked for urls. Note a tag is taken to be < .. > or
    // < .. < so comments are supported, but comment blocks are still
    // scanned.
    private void refill() {
        int value = getRaw();
        if (value == -1) {
            return;
        }
        if (value == '<') {
            buffer.append('<');
            StringBuilder tag = new StringBuilder();
            boolean closed = false;
            value = getRaw();
            while (value != -1) {
                if (value == '>') {
                    closed = true;
                    break;
                }
                tag.append((char) value);
                if (peekRaw() == '<') {
                    // Unterminated tag: the next tag starts right here.
                    break;
                }
                value = getRaw();
            }
            buffer.append(findURL(tag.toString()));
            // Only echo a closing '>' that was really read; at the end of the
            // stream or before a new '<' there is nothing to add.
            if (closed) {
                buffer.append('>');
            }
        } else {
            while (value != -1 && value != '<') {
                buffer.append((char) value);
                value = getRaw();
            }
            // If we've read the '<' that starts the next tag, push it back so
            // that the next refill sees it.
            if (value == '<') {
                peek_value = value;
            }
        }
    }

    // Lower-cases a tag except for the parts enclosed in double quotes.
    // Currently unused, kept for callers that want case-normalised tags.
    private String smartLower(String tag) {
        StringBuilder lowered = new StringBuilder(tag.length());
        boolean lower = true;
        for (int i = 0; i < tag.length(); i++) {
            char ch = tag.charAt(i);
            if (ch == '"') {
                lower = !lower;
            }
            lowered.append(lower ? Character.toLowerCase(ch) : ch);
        }
        return lowered.toString();
    }

    // Run the finite-state machine on a buffer-load (the text of one tag,
    // without its angle brackets). The first url found is resolved against
    // the page url and stored in the matching list. Returns the tag text
    // unchanged.
    private String findURL(String tag) {
        StringBuilder head = new StringBuilder(tag.length());
        StringBuilder url_str = new StringBuilder();
        Vector sink = null;      // list that will receive the url
        String keyword = null;   // keyword currently being matched
        int matched = 0;         // number of keyword characters matched so far
        int resume = GROUND;     // state to return to when a keyword mismatches
        int state = GROUND;
        int pos = 0;

        // Sift through the tag for urls
        while (pos < tag.length() && state != URL_FINAL && state != COMMENT_FINAL) {
            char ch = tag.charAt(pos++);
            // Character.toLowerCase is locale independent, unlike String.toLowerCase.
            char lch = Character.toLowerCase(ch);
            switch (state) {
            case GROUND:
                if (lch == '!') {
                    state = COMMENT;
                } else {
                    keyword = keywordStartingWith(lch);
                    if (keyword != null) {
                        matched = 1;
                        resume = GROUND;
                        state = KEYWORD;
                    }
                }
                head.append(ch);
                break;
            // A possible comment
            case COMMENT:
                state = (lch == '-') ? COMMENT_DASH : GROUND;
                head.append(ch);
                break;
            case COMMENT_DASH:
                state = (lch == '-') ? COMMENT_FINAL : GROUND; // skip comments
                head.append(ch);
                break;
            // Inside a tag that mentioned "link": only "href" is of interest
            case LINK_TAG:
                if (lch == 'h') {
                    keyword = KW_HREF;
                    matched = 1;
                    resume = LINK_TAG;
                    state = KEYWORD;
                }
                head.append(ch);
                break;
            // Part-way through href / src / link / background
            case KEYWORD:
                if (lch != keyword.charAt(matched)) {
                    state = resume;
                } else if (++matched == keyword.length()) {
                    if (KW_LINK.equals(keyword)) {
                        state = LINK_TAG;
                    } else {
                        sink = (resume == LINK_TAG) ? link_links : linksFor(keyword);
                        state = ATTR_NAME;
                    }
                }
                head.append(ch);
                break;
            case ATTR_NAME:
                if (lch == '=') {
                    state = ATTR_EQUAL;
                } else if (!isBlank(lch)) {
                    state = GROUND;
                }
                head.append(ch);
                break;
            case ATTR_EQUAL:
                if (isBlank(lch)) {
                    head.append(ch);
                } else if (isQuote(lch)) {
                    state = VALUE_QUOTED;
                    head.append(ch);
                } else {
                    // First character of an unquoted value
                    state = VALUE_UNQUOTED;
                    url_str.append(ch);
                }
                break;
            case VALUE_UNQUOTED:
                if (isBlank(lch)) {
                    state = URL_FINAL;
                    pos--; // the blank belongs to the rest of the tag
                } else {
                    url_str.append(ch);
                }
                break;
            case VALUE_QUOTED:
                if (isQuote(lch)) {
                    state = URL_FINAL;
                    pos--; // the closing quote belongs to the rest of the tag
                } else {
                    url_str.append(ch);
                }
                break;
            }
        }

        // An unquoted value that runs up to the end of the tag, as in
        // <a href=page.html>, is complete even though no blank followed it.
        if (state == VALUE_UNQUOTED) {
            state = URL_FINAL;
        }

        if (state == URL_FINAL) {
            String spec = url_str.toString();
            try {
                sink.addElement(new URL(url, spec));
            }
            catch (MalformedURLException e) {
                System.err.println("Ignoring unparsable link '" + spec + "': " + e.getMessage());
            }
        }

        // Reassemble the tag exactly as it was read.
        return head.toString() + url_str + tag.substring(pos);
    }

    // Returns the keyword that begins with the given lower-case character,
    // or null if no keyword does.
    private static String keywordStartingWith(char lch) {
        switch (lch) {
        case 'b': return KW_BACKGROUND;
        case 'h': return KW_HREF;
        case 'l': return KW_LINK;
        case 's': return KW_SRC;
        default: return null;
        }
    }

    // Returns the list that collects urls introduced by the given keyword.
    private Vector linksFor(String keyword) {
        if (KW_HREF.equals(keyword)) {
            return href_links;
        }
        if (KW_SRC.equals(keyword)) {
            return src_links;
        }
        return background_links;
    }

    // True for the characters that may separate attributes inside a tag.
    private static boolean isBlank(char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    // True for either kind of attribute quote.
    private static boolean isQuote(char ch) {
        return ch == '\'' || ch == '"';
    }

    // Guesses a content type from the text of a url: directories, .html and
    // .htm pages and urls carrying a query are taken to be html, everything
    // else is taken to be an image.
    static private String guessContentType(String text) {
        if (text.endsWith("/")
            || text.endsWith(".html")
            || text.endsWith(".htm")
            || text.indexOf("?") > 0) {
            return "text/html";
        }
        return "image/jpeg";
    }

}
Note: See TracBrowser for help on using the repository browser.