package org.nzdl.gsdl.GsdlCollageApplet;

import java.awt.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.ImageIcon;

/**
 * Controls retrieval of images from the specified starting url. Follows appropriate
 * links from this starting point, traversing in a tree-like state through several other
 * pages. Filters images and links based on specified parameters. Also controls the quantity
 * of downloading that occurs by restricting the number of downloaded images that are yet to
 * be displayed, and the total number of downloads allowed is also restricted by
 * the applet application (to prevent downloading occurring infinitely).
 *
 * @author Katrina Edgar
 * @author David Bainbridge
 */
public class DownloadUrls extends Thread {

    /** Refers to applet */
    GsdlCollageApplet app_ = null;

    /** Refers to download thread */
    DownloadImages download_images_ = null;

    /** The address from which the application should start looking for images */
    String starting_url_ = null;

    /** The root directory of Greenstone */
    String document_root_ = null;

    /** CHRIS - Holds the contents of the collection's assoc directory */
    // File[] assocDir_ = null;

    /** Restricts links followed from the starting url to links that contain this string.
     *  May hold several alternatives separated by '%'. */
    String href_musthave_ = null;

    /** Restricts links followed from the starting url to links that do not contain this string.
     *  Also prevents image names from containing this string.
     *  May hold several alternatives separated by '%'. */
    String image_mustnothave_ = null;

    /** Ignore images whose names begin with this string.
     *  NOTE(review): filter_image() currently returns true (i.e. keeps the image) when the
     *  url starts with this prefix, which looks inverted relative to this description —
     *  preserved as-is, confirm intent against callers before changing. */
    String image_ignore_ = null;

    /** Restricts the types of images included in the collage, for example jpg, gif, etc.
     *  May hold several extensions separated by '%'. */
    String image_type_ = null;

    /** A static delay used when attempting to download more images into a full downloading buffer */
    final int delay_ = 3000;

    /** The maximum number of images to have downloaded and not yet displayed */
    final int buffer_size_ = 1;

    /** Used in cases where the image maps to a url outside of its original location.
     *  When used with Greenstone the collage images will refer to documents in the collections
     *  from which the images are sourced. When used individually, the images may be saved into
     *  a user directory and the pages they reference may be external hyperlinks.
     *  Maps image name -&gt; external hyperlink. */
    Hashtable external_links_ = null;

    /** Records all urls which have already been examined */
    Hashtable visited_url_ = null;

    /** Determines whether there are still pages to examine and images to download */
    boolean thread_running_ = true;

    /** Logging verbosity; higher values print more diagnostics to stderr */
    int verbosity_ = 0;

    /** Records all images which have already been examined,
     *  keyed by directory url, each value being a Hashtable of image names in that directory */
    Hashtable visited_images_ = null;

    /** Tracker handed to DownloadImages for monitoring image loading */
    MediaTracker tracker;

    /**
     * Constructor to initialise a download thread from which images are found,
     * saves parameters into local variables for use within the class.
     *
     * @param app reference to the applet
     * @param download_images class which stores the images retrieved in triplets
     * @param starting_url the url from which the search for images should begin
     * @param href_musthave restricts links to only those containing this string
     * @param image_mustnothave restricts links and image names to only those that don't contain this string
     * @param image_ignore restricts the beginning of image names
     * @param image_type restricts the type of images included in the collage to those named
     * @param document_root the root directory of Greenstone
     * @param verbosity logging verbosity level
     * @param trk media tracker used while downloading images
     */
    public DownloadUrls(GsdlCollageApplet app, DownloadImages download_images,
                        String starting_url, String href_musthave, String image_mustnothave,
                        String image_ignore, String image_type, String document_root,
                        int verbosity, MediaTracker trk) {
        super("DownloadUrls");

        app_ = app;
        download_images_ = download_images;
        starting_url_ = starting_url;
        href_musthave_ = href_musthave;
        image_mustnothave_ = image_mustnothave;
        image_ignore_ = image_ignore;
        image_type_ = image_type;
        document_root_ = document_root;
        verbosity_ = verbosity;
        tracker = trk;

        System.err.println("starting_url_ " + starting_url + "\n"
                           + "href_musthave_ " + href_musthave + "\n"
                           + "image_mustnothave_ " + image_mustnothave + "\n"
                           + "image_ignore_ " + image_ignore + "\n"
                           + "image_type_ " + image_type + "\n"
                           + "document root " + document_root_);
    }

    /**
     * Determines whether or not a url has already been examined.
     * As a side effect, records the url as visited when it was not seen before.
     *
     * @param url_string the url to check
     * @return true if the url has been visited, false if not
     */
    public boolean already_visited(String url_string) {
        int hash_pos = url_string.indexOf("#");
        if (hash_pos > 0) {
            // strip off #anchor reference
            url_string = url_string.substring(0, hash_pos);
        }

        // if the url has been visited before, return true
        if (visited_url_.containsKey(url_string)) {
            if (verbosity_ > 3) {
                System.err.println("Visited " + url_string + " before!");
            }
            return true;
        }

        visited_url_.put(url_string, "visited");
        return false;
    }

    /**
     * Determines whether or not an image (or its screenview) has already been examined.
     * As a side effect, records the image as visited when it was not seen before.
     *
     * NOTE(review): once a directory has been seen, any image named "screenview*" is treated
     * as visited, and any new image is skipped if a "screenview*" entry already exists for
     * that directory — presumably to admit at most one screenview per document directory;
     * confirm against Greenstone collection layout.
     *
     * @param url_string the url to check (its directory part is used as the hash key)
     * @param img_name the image to check
     * @return true if the image has been visited, false if not
     */
    public boolean image_visited(String url_string, String img_name) {
        String hash_dir = url_string.substring(0, url_string.lastIndexOf("/"));

        if (visited_images_.containsKey(hash_dir)) {
            Hashtable hashed_images = (Hashtable) visited_images_.get(hash_dir);
            if (img_name.startsWith("screenview")) {
                return true;
            }
            if (hashed_images.containsKey(img_name)) {
                return true;
            }
            // a pre-existing screenview entry blocks further images from this directory
            Enumeration enu = hashed_images.keys();
            for (; enu.hasMoreElements();) {
                String name = (String) enu.nextElement();
                if (name.startsWith("screenview")) {
                    return true;
                }
            }
            hashed_images.put(img_name, "visited");
        } else {
            Hashtable hashed_images = new Hashtable();
            hashed_images.put(img_name, "visited");
            visited_images_.put(hash_dir, hashed_images);
        }
        return false;
    }

    /**
     * Restricts the type of images that can be included in the collage.
     * image_type_ may contain several extensions separated by '%'; the url matches if it
     * ends with any non-blank alternative (comparison on the lower-cased url).
     *
     * NOTE(review): matching works by consuming image_type_ field in place and restoring it
     * afterwards — non-reentrant, safe only because a single download thread uses it.
     *
     * @param url_string the url to check
     * @return true if the image is of a specified type, false if not
     */
    public boolean image_file_extension(String url_string) {
        // lower case comparisons
        String url_lstring = url_string.toLowerCase();

        if (image_type_ == null)
            return true;

        String tmp = image_type_;
        String original_image_type_ = image_type_;

        // walk the '%'-separated alternatives one segment at a time
        while (image_type_ != null && image_type_.indexOf("%") >= 0) {
            tmp = image_type_.substring(0, image_type_.indexOf("%"));
            if (image_type_.length() > image_type_.indexOf("%") + 1)
                image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
            else
                image_type_ = null;

            if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
                image_type_ = original_image_type_;
                return true;
            }
        }

        // the trailing segment after the last '%' (or the whole string when no '%')
        if (image_type_ != null && url_lstring.endsWith(image_type_)) {
            image_type_ = original_image_type_;
            return true;
        }

        image_type_ = original_image_type_;
        return false;
    }

    /**
     * Restricts images to only those that satisfy several specified conditions
     * regarding the content of the image name and url: rejects urls containing any
     * '%'-separated alternative of image_mustnothave_.
     *
     * NOTE(review): a url starting with image_ignore_ bypasses the checks and is KEPT
     * (returns true), and an already-visited url is also kept — duplicates are only
     * caught later by image_visited() in add_image(). Preserved as-is.
     *
     * @param url_string the url to check
     * @return true if the image is satisfactory, false if not
     */
    public boolean filter_image(String url_string) {
        if (image_ignore_ == null || !url_string.startsWith(image_ignore_)) {
            if (!already_visited(url_string)) {
                if (image_mustnothave_ != null) {
                    String tmp = image_mustnothave_;
                    String original_image_mustnothave_ = image_mustnothave_;

                    while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
                        tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
                        if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
                            image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
                        else
                            image_mustnothave_ = null;

                        if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
                            image_mustnothave_ = original_image_mustnothave_;
                            return false;
                        }
                    }
                    image_mustnothave_ = original_image_mustnothave_;

                    if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
                        image_mustnothave_ = original_image_mustnothave_;
                        return false;
                    }

                    if (verbosity_ > 2) {
                        System.err.println("src url = " + url_string);
                    }
                    image_mustnothave_ = original_image_mustnothave_;
                }
            }
        }
        return true;
    }

    /**
     * Restricts links to only those that satisfy several specified conditions
     * regarding the address of the link: the url must contain one of the '%'-separated
     * href_musthave_ alternatives (when set), must not contain an image_mustnothave_
     * alternative, the depth limit must not be exceeded, and the source page's url
     * must not start with the candidate url (avoids trivially circular links).
     *
     * @param url_string the url to check
     * @param new_url_string the url from which this link was found
     * @param depth the number of links followed on this path
     * @return true if the link should be followed, false if not
     */
    public boolean filter_href(String url_string, String new_url_string, int depth) {
        boolean has_href = false;

        String tmp = href_musthave_;
        String original_href_musthave_ = href_musthave_;

        // checks that it does contain this content
        if (href_musthave_ != null) {
            while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
                tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
                if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
                    href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
                else
                    href_musthave_ = null;

                if (url_string.indexOf(tmp) >= 0)
                    has_href = true;
            }
            if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
                has_href = true;
            href_musthave_ = original_href_musthave_;
        }

        tmp = image_mustnothave_;
        String original_image_mustnothave_ = image_mustnothave_;

        // checks that it doesn't contain this content
        if (image_mustnothave_ != null) {
            while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
                tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
                if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
                    image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
                else
                    image_mustnothave_ = null;

                if (url_string.indexOf(tmp) >= 0)
                    has_href = false;
            }
            if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
                has_href = false;
            image_mustnothave_ = original_image_mustnothave_;
        }

        // return true if the link is valid and false if not
        if (href_musthave_ == null || has_href) {
            // might be another URL
            if (depth < app_.maxDepth()) {
                if (!new_url_string.startsWith(url_string)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Adds an image to the stored downloaded images as a triplet.
     * Skips images already recorded by image_visited(). When the total number of
     * downloads reaches the applet's maximum the thread is terminated.
     *
     * NOTE(review): Thread.stop() is deprecated/unsafe (and removed in recent JDKs);
     * a cooperative flag (thread_running_ already exists) would be the modern
     * replacement — preserved as-is to keep the applet-era behavior.
     *
     * @param url the image to download
     * @param from_url the url that this image was sourced from
     * @param img_name the name of the image
     */
    public void add_image(URL url, String from_url, String img_name) {
        // get the image from the url
        if (verbosity_ >= 2) {
            System.err.println(" Downloading image URL: " + url.toString());
        }

        if (image_visited(url.toString(), img_name))
            return;

        int size = download_images_.downloadImage(tracker, url, from_url, img_name);

        try {
            // if have completed the maximum number of downloads for the
            // application then stop
            if (size == app_.maxDownloads()) {
                stop();
            }
        } catch (Exception e) {
            thread_running_ = false;
            stop();
            e.printStackTrace();
        }
    }

    /**
     * Connects to the starting url and looks for all images and links from this
     * original page. Image links are processed first, so that any images found can be
     * downloaded immediately and placed on the applet. Secondly, the links to other
     * pages are recursively processed by this function and treated as a starting url.
     *
     * @param new_url the url from which to start searching for images and links
     * @param depth the number of links that have been followed on this path
     */
    public void rec_add_images(String new_url, int depth) {
        if (already_visited(new_url))
            return;

        // check if there is a scenario where external hyperlinks are being used
        externalLinks();

        String img_name = new String();

        // connect to the url
        CURL curl = new CURL(new_url);
        if (curl.connected_ok()) {
            if (verbosity_ >= 1) {
                System.err.print("Connected OK ... ");
            }

            // read the page
            curl.readAll();
            if (verbosity_ >= 1) {
                System.err.println("URL read.");
            }

            // get all the image (src) links into a vector
            Vector src_links = curl.getSrcLinks();
            if (verbosity_ >= 2) {
                System.err.println(" Got src links... there are " + src_links.size() + " of them.");
            }

            // process each of the image links according to the parameters given.
            for (int i = 0; i < src_links.size(); i++) {
                URL url = (URL) src_links.get(i);
                String url_string = url.toString();

                if (verbosity_ >= 4) {
                    System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
                }

                if (image_file_extension(url_string)) {
                    if (filter_image(url_string)) {
                        img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());

                        if (verbosity_ >= 2) {
                            System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
                        }

                        // prefer the external hyperlink for this image when one is mapped
                        if ((external_links_ != null) && (!external_links_.isEmpty())) {
                            String ext = (String) external_links_.get(img_name);
                            if (ext != null) {
                                add_image(url, ext, img_name);
                            } else {
                                add_image(url, new_url, img_name);
                            }
                        } else {
                            add_image(url, new_url, img_name);
                        }
                    }
                }
            }

            // get all the href links into a vector
            Vector href_links = curl.getHrefLinks();
            if (verbosity_ >= 2) {
                System.err.println(" Got href links... there are " + href_links.size() + " of them.");
            }

            // process each of the href links according to the parameters given.
            for (int i = 0; i < href_links.size(); i++) {
                URL url = (URL) href_links.get(i);
                String url_string = url.toString();

                if (image_file_extension(url_string)) {
                    // an href that points directly at an image is downloaded, not followed
                    if (filter_image(url_string)) {
                        img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());

                        if ((external_links_ != null) && (!external_links_.isEmpty())) {
                            String ext = (String) external_links_.get(img_name);
                            if (ext != null)
                                add_image(url, ext, img_name);
                            else
                                add_image(url, new_url, img_name);
                        } else {
                            // NOTE(review): this branch records the image's own url as its
                            // source page (the src-link loop above uses new_url) — looks
                            // inconsistent, preserved as-is.
                            add_image(url, url_string, img_name);
                        }
                    }
                } else {
                    if (filter_href(url_string, new_url, depth)) {
                        rec_add_images(url_string, depth + 1);
                    }
                }
            }
        } else {
            System.err.println("Unable to download " + new_url);
        }
    }

    /**
     * Used in cases where the image maps to a url outside of its original location.
     * When used with Greenstone the collage images will refer to documents in the collections
     * from which the images are sourced. When used individually, the images may be saved into
     * a user directory and the pages they reference may be external hyperlinks.
     * This function reads that external links file and creates a hash map of the image to
     * its external hyperlink. If the file does not exist the download thread will continue
     * and assume the first case, that links are internal.
     */
    public void externalLinks() {
        external_links_ = null;
        try {
            // inside a Greenstone collection (starting url under the document root)
            // links are internal, so there is no external-links file to read
            if (starting_url_ == null
                || (document_root_ != null && starting_url_.indexOf(document_root_) >= 0)) {
                return;
            }

            // open a url to the file written
            URL u = new URL(starting_url_ + "externallinks");
            // NOTE(review): reader is not closed if readLine() throws — a
            // try-with-resources would fix this on Java 7+.
            BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));

            external_links_ = new Hashtable();
            String l = r.readLine();

            // split the line on the space, first part is the image, second part the link
            while (l != null) {
                String tmp1 = new String();
                String tmp2 = new String();
                if (l.indexOf(" ") >= 0) {
                    tmp1 = l.substring(0, l.indexOf(" "));
                    if (l.length() > l.indexOf(" ") + 1)
                        tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
                    else
                        tmp2 = null;

                    if (tmp2 != null) {
                        external_links_.put(tmp1, tmp2);
                    }
                }
                l = r.readLine();
            }
            r.close();
        } catch (Exception e) {
            // best-effort: missing or unreadable file just means links are internal
            e.printStackTrace();
            return;
        }
    }

    /** Controls the download thread: crawls from the starting url, then tells the
     *  image-download side that no more images are coming. */
    public void run() {
        System.err.println("Starting download thread.");

        visited_url_ = new Hashtable();
        visited_images_ = new Hashtable();
        rec_add_images(starting_url_, 1);

        download_images_.stopDownload();

        System.err.println("Download thread finished.");
    }
}