package org.nzdl.gsdl.GsdlCollageApplet; import java.io.*; import java.net.*; import java.util.*; /** Examines html pages and extracts all the images and links */ public class CURL { private boolean url_valid = true; private InputStream input = null; private int peek_value = -1; private String buffer = ""; private URL url = null; private Vector href_links = null; private Vector src_links = null; private Vector link_links = null; private Vector background_links = null; /** Starts processing the given url for images and links * @param url_str The url to examine */ public CURL(String url_str) { href_links = new Vector(); src_links = new Vector(); link_links = new Vector(); background_links = new Vector(); try { url = new URL(url_str); input = url.openStream(); // int value = 0; // while ( (value=input.read())!=-1){ //System.err.print((char)value); //} //url = new URL(url_str); //input = url.openStream(); } catch (MalformedURLException e) { url_valid = false; } catch (IOException e) { url_valid = false; } } /** Checks that a valid connection to the url has been made */ public boolean connected_ok() { return url_valid; } /** Gets any href links from this url * @return Vector of href links */ public Vector getHrefLinks() { return href_links; } /** Gets any source links from this url * @return Vector of source links */ public Vector getSrcLinks() { return src_links; } /** Gets any other links from this url * @return Vector of other links */ public Vector getLinkLinks() { return link_links; } /** Gets any background links from this url * @return Vector of background links */ public Vector getBackgroundLinks() { return background_links; } /** Gets the url currently being processed */ public URL getURL() { return url; } /** Checks that the content of the url is in html */ public boolean isHTML() { String content_type = guessContentType(url.toString()); if(content_type.startsWith("text/html")) { return true; } return false; } /** Reads a value from the buffer * @return Value read if successful and -1 if not */ public int read() { int value = -1; if(isHTML()) { if(buffer.length() == 0) { refill(); } if(buffer.length() != 0) { value = getBuffer(); } } else { value = getRaw(); } return value; } /** Reads the entire URL */ public void readAll() { int value; while((value = read()) != -1) { } } // Gets the head of the buffered buffer. private int getBuffer() { if(buffer.length() > 0) { int value = buffer.charAt(0); buffer = buffer.substring(1, buffer.length()); return value; } else { System.err.println("Called getRaw on an empty string"); return -1; } } // Gets the head of the raw buffer. private int getRaw() { int value = -1; if(peek_value != -1) { value = peek_value; peek_value = -1; } else { try { value = input.read(); } catch (Exception e) { e.printStackTrace(); } } return value; } private int peekRaw() { if(peek_value == -1) { peek_value = getRaw(); } return peek_value; } // Refills the buffered buffer with the next tag or non-tag block // The tag is checked for urls. Note a tag is taken to be < .. > or // < .. < so comments are supported, but comment blocks are still // scanned. private void refill() { int value = getRaw(); if(value != -1) { if(value == '<') { //System.err.println("Parsing a tag starting " + (char)value); // Add opening < to buffer setBuffer(value); String tag = ""; value = getRaw(); while(value != -1 && peekRaw() != '<' && value != '>') { //System.err.print((char)value); tag = tag + (char) value; value = getRaw(); } //System.err.println("Read a " + (char)value); //tag = smartLower(tag); //System.err.println("tag "+tag); tag = findURL(tag); buffer = buffer + tag; // Add closing > to buffer setBuffer(value); //System.err.println("Finished tag"); } else { //System.err.println("Parsing content"); //System.err.println("Value = " + value + " = '" + (char)value + "'"); while(value != -1 && value != '<') { //System.err.println("Read a '" + (char)value + "'"); setBuffer(value); value = getRaw(); } // If we've accidently read the '<' push it back in the stream by // setting peek_value to value. Since the peek_value will be returned // on the next read this has the desired effect. if(value == '<') { peek_value = value; } //System.err.println("Read " + buffer); //System.err.println("Finished Content"); } } } // Sets the tail of the buffered buffer. private void setBuffer(int value) { buffer = buffer + (char) value; } private String smartLower(String tag) { boolean lower = true; String new_tag = ""; for(int i = 0; i < tag.length(); i++) { // Disable case lowering for value tags (bound by "") if (tag.charAt(i) == '"') { if(lower) { lower = false; } else { lower = true; } } // Lower everything else if(lower) { new_tag = new_tag + Character.toLowerCase(tag.charAt(i)); } else { new_tag = new_tag + tag.charAt(i); } } return new_tag; } private final static int GROUND = 0; private final static int COMMENT = 5; private final static int COMMENT_DASH = 6; private final static int COMMENT_FINAL = 7; private final static int H = 11; private final static int HR = 12; private final static int HRE = 13; private final static int HREF = 14; private final static int HREF_EQUAL = 15; private final static int HREF_Q = 16; private final static int HREF_NQ = 17; private final static int HREF_FINAL = 18; private final static int S = 23; private final static int SR = 24; private final static int SRC = 25; private final static int SRC_EQUAL = 26; private final static int SRC_Q = 27; private final static int SRC_NQ = 28; private final static int SRC_FINAL = 29; private final static int L = 67; private final static int LI = 68; private final static int LIN = 69; private final static int LINK = 70; private final static int LINK_QUOTE = 72; private final static int LINK_H = 73; private final static int LINK_HR = 74; private final static int LINK_HRE = 75; private final static int LINK_HREF = 76; private final static int LINK_EQUAL = 77; private final static int LINK_Q = 78; private final static int LINK_NQ = 79; private final static int LINK_FINAL = 80; private final static int B = 85; private final static int BA = 86; private final static int BAC = 87; private final static int BACK = 88; private final static int BACKG = 89; private final static int BACKGR = 90; private final static int BACKGRO = 91; private final static int BACKGROU = 92; private final static int BACKGROUN = 93; private final static int BACKGROUND = 94; private final static int BACKGROUND_EQUAL = 95; private final static int BACKGROUND_Q = 96; private final static int BACKGROUND_NQ = 97; private final static int BACKGROUND_FINAL = 98; private final static int FINAL = 99; // Run the finite-state machine on a buffer-load. private String findURL(String tail) { int state = GROUND; String head = ""; String url_str = ""; // Sift through the tag for urls while(tail.length() > 0 && state != BACKGROUND_FINAL && state != COMMENT_FINAL && state != HREF_FINAL && state != LINK_FINAL && state != SRC_FINAL) { char ch = tail.charAt(0); String sch = "" + ch; sch = sch.toLowerCase(); char lch = sch.charAt(0); tail = tail.substring(1); switch (state) { // Initial state. case GROUND: switch (lch) { case '!': state = COMMENT; break; case 'B': case 'b': state = B; break; case 'H': case 'h': state = H; break; case 'L': case 'l': state = L; break; case 'S': case 's': state = S; break; } head = head + ch; break; // A possible comment case COMMENT: switch(lch) { case '-': state = COMMENT_DASH; break; default: state = GROUND; break; } head = head + ch; break; case COMMENT_DASH: switch(lch) { case '-': state = COMMENT_FINAL; break; // skip comments default: state = GROUND; break; } head = head + ch; break; // A possible href case H: switch(lch) { case 'R': case 'r': state = HR; break; default: state = GROUND; break; } head = head + ch; break; case HR: switch(lch) { case 'E': case 'e': state = HRE; break; default: state = GROUND; break; } head = head + ch; break; case HRE: switch(lch) { case 'F': case 'f': state = HREF; break; default: state = GROUND; break; } head = head + ch; break; case HREF: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '=': state = HREF_EQUAL; break; default: state = GROUND; break; } head = head + ch; break; case HREF_EQUAL: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '\'': case '\"': state = HREF_Q; break; default: state = HREF_NQ; break; } if(state == HREF_NQ) { url_str = url_str + ch; } else { head = head + ch; } break; case HREF_NQ: switch(lch) { case ' ': case '\t': case '\n': case '\r': state = HREF_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; case HREF_Q: switch(lch) { case '\'': case '\"': state = HREF_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; // A possible src case S: switch(lch) { case 'R': case 'r': state = SR; break; default: state = GROUND; break; } head = head + ch; break; case SR: switch(lch) { case 'C': case 'c': state = SRC; break; default: state = GROUND; break; } head = head + ch; break; case SRC: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '=': state = SRC_EQUAL; break; default: state = GROUND; break; } head = head + ch; break; case SRC_EQUAL: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip spaces case '\'': case '\"': state = SRC_Q; break; default: state = SRC_NQ; break; } if(state == SRC_NQ) { url_str = url_str + ch; } else { head = head + ch; } break; case SRC_NQ: switch(lch) { case ' ': case '\t': case '\n': case '\r': state = SRC_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; case SRC_Q: switch(lch) { case '\'': case '\"': state = SRC_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; // A possible link-src combo case L: switch(lch) { case 'I': case 'i': state = LI; break; default: state = GROUND; break; } head = head + ch; break; case LI: switch(lch) { case 'N': case 'n': state = LIN; break; default: state = GROUND; break; } head = head + ch; break; case LIN: switch(lch) { case 'K': case 'k': state = LINK; break; default: state = GROUND; break; } head = head + ch; break; case LINK: switch(lch) { case 'H': case 'h': state = LINK_H; break; default: state = LINK; break; } head = head + ch; break; case LINK_H: switch(lch) { case 'R': case 'r': state = LINK_HR; break; default: state = LINK; break; } head = head + ch; break; case LINK_HR: switch(lch) { case 'E': case 'e': state = LINK_HRE; break; default: state = LINK; break; } head = head + ch; break; case LINK_HRE: switch(lch) { case 'F': case 'f': state = LINK_HREF; break; default: state = LINK; break; } head = head + ch; break; case LINK_HREF: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '=': state = LINK_EQUAL; break; default: state = GROUND; break; } head = head + ch; break; case LINK_EQUAL: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '\'': case '\"': state = LINK_Q; break; default: state = LINK_NQ; break; } if(state == LINK_NQ) { url_str = url_str + ch; } else { head = head + ch; } break; case LINK_NQ: switch(lch) { case ' ': case '\t': case '\n': case '\r': state = LINK_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; case LINK_Q: switch(lch) { case '\'': case '\"': state = LINK_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; // A possible background case B: switch(lch) { case 'A': case 'a': state = BA; break; default: state = GROUND; break; } head = head + ch; break; case BA: switch(lch) { case 'C': case 'c': state = BAC; break; default: state = GROUND; break; } head = head + ch; break; case BAC: switch(lch) { case 'K': case 'k': state = BACK; break; default: state = GROUND; break; } head = head + ch; break; case BACK: switch(lch) { case 'G': case 'g': state = BACKG; break; default: state = GROUND; break; } head = head + ch; break; case BACKG: switch(lch) { case 'R': case 'r': state = BACKGR; break; default: state = GROUND; break; } head = head + ch; break; case BACKGR: switch(lch) { case 'O': case 'o': state = BACKGRO; break; default: state = GROUND; break; } head = head + ch; break; case BACKGRO: switch(lch) { case 'U': case 'u': state = BACKGROU; break; default: state = GROUND; break; } head = head + ch; break; case BACKGROU: switch(lch) { case 'N': case 'n': state = BACKGROUN; break; default: state = GROUND; break; } head = head + ch; break; case BACKGROUN: switch(lch) { case 'D': case 'd': state = BACKGROUND; break; default: state = GROUND; break; } head = head + ch; break; case BACKGROUND: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '=': state = BACKGROUND_EQUAL; break; default: state = GROUND; break; } head = head + ch; break; case BACKGROUND_EQUAL: switch(lch) { case ' ': case '\t': case '\n': case '\r': break; // Skip blanks case '\'': case '\"': state = BACKGROUND_Q; break; default: state = BACKGROUND_NQ; break; } if(state == BACKGROUND_NQ) { url_str = url_str + ch; } else { head = head + ch; } break; case BACKGROUND_NQ: switch(lch) { case ' ': case '\t': case '\n': case '\r': state = BACKGROUND_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; case BACKGROUND_Q: switch(lch) { case '\'': case '\"': state = BACKGROUND_FINAL; tail = ch + tail; break; default: url_str = url_str + ch; break; } break; } } url_str = url_str.replaceAll("&","&"); if(state == HREF_FINAL ) { try { URL new_url = new URL(url, url_str); href_links.addElement(new_url); } catch (Exception e) { e.printStackTrace(); } } if(state == SRC_FINAL ) { try { URL new_url = new URL(url, url_str); src_links.addElement(new_url); } catch (Exception e) { e.printStackTrace(); } } if(state == LINK_FINAL ) { try { URL new_url = new URL(url, url_str); link_links.add(new_url); } catch (Exception e) { e.printStackTrace(); } } if(state == BACKGROUND_FINAL ) { try { URL new_url = new URL(url, url_str); background_links.add(new_url); } catch (Exception e) { e.printStackTrace(); } } return head + url + tail; } static private String guessContentType(String text) { if(text.endsWith("/")) { return "text/html"; } else if (text.endsWith(".html")) { return "text/html"; } else if (text.endsWith(".htm")) { return "text/html"; } else if (text.indexOf("?")>0) { return "text/html"; } return "image/jpeg"; } }