/**#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *#########################################################################
 */
package org.greenstone.gatherer.collection;

import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.tree.*;
import org.greenstone.gatherer.Configuration;
import org.greenstone.gatherer.DebugStream;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.WGet;
import org.greenstone.gatherer.file.FileNode;
import org.greenstone.gatherer.file.FileSystemModel;
import org.greenstone.gatherer.file.WorkspaceTreeModel;
import org.greenstone.gatherer.gui.DownloadProgressBar;
import org.greenstone.gatherer.gui.tree.WorkspaceTree;
import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
import org.greenstone.gatherer.util.GURL;
import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
import org.greenstone.gatherer.util.Utility;

/**
 * A single wget-based mirroring job. The job owns its own log document and
 * progress bar, parses wget's stderr output to drive progress updates, and
 * tracks a simple four-value state machine (STOPPED / RUNNING / PAUSED /
 * COMPLETE) that both the GUI (via {@link #actionPerformed}) and the WGet
 * wrapper thread consult.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.0
 */
public class DownloadJob implements ActionListener {

    /** Whether to pass the debug flag through to the native wget call. */
    private boolean debug;
    // NOTE(review): higher_directories is never read or written in this file —
    // presumably a vestigial option; confirm before removing the field.
    private boolean higher_directories;
    /** If true, wget is told not to ascend to parent directories (-np). */
    private boolean no_parents;
    /** If true, wget may span to other hosts (-H). */
    private boolean other_hosts;
    /** If true, wget downloads page requisites such as images (-p). */
    private boolean page_requisites;
    /** If true, the native wget call is made quiet (-q). */
    private boolean quiet;
    /** Per-job wget output log, written as lines arrive from wget's stderr. */
    private AppendLineOnlyFileDocument download_log;
    /** The progress bar widget representing this job in the GUI. */
    private DownloadProgressBar progress;
    /** The root url this job was asked to mirror. */
    private GURL initial = null;
    /** The url currently being downloaded, if any. */
    private GURL url = null;
    // private TreeModel model;
    /** Recursion depth: negative = infinite, 0 = page only, >0 = that depth. */
    private int depth;
    /** The state this job was in before the most recent transition. */
    private int previous_state;
    /** The current job state: one of COMPLETE, PAUSED, RUNNING, STOPPED. */
    private int state;
    /** The raw url string currently being downloaded. */
    private String current_url;
    /** Directory the mirrored files are written into. */
    private String destination;
    /** Proxy password, if proxy authentication is in use. */
    private String proxy_pass;
    /** Proxy user name, if proxy authentication is in use. */
    private String proxy_user;
    /** Every url this job has seen so far (downloads and parsed links). */
    private Vector encountered_urls;
    /** Urls whose download returned an error. */
    private Vector failed_urls;
    /** The WGet wrapper that spawned this job; used for thread control. */
    private WGet mummy;

    // Job states. Made final: these are constants and must never be reassigned.
    public static final int COMPLETE = 0;
    public static final int PAUSED = 1;
    public static final int RUNNING = 2;
    public static final int STOPPED = 3;

    /** Construct a new download job, creating (and truncating) its log file
     * and its progress bar, and leaving it in the STOPPED state.
     * @param debug pass wget's debug flag through to the native call
     * @param no_parents forbid ascending to parent directories
     * @param other_hosts allow spanning to other hosts
     * @param page_requisites also fetch page requisites (images etc.)
     * @param quiet make the native wget call quiet
     * @param initial the root url to mirror
     * @param depth recursion depth (negative = infinite, 0 = this page only)
     * @param destination directory to download into
     * @param proxy_pass proxy password, or null
     * @param proxy_user proxy user name, or null
     * @param mummy the owning WGet wrapper
     * @param simple passed through to the DownloadProgressBar constructor
     */
    public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
	// this.model = model;

	// One log file per root url; delete any stale log from a previous run
	// and make sure its parent directory exists.
	String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
	File log_file = new File(log_filename);
	if(log_file.exists()) {
	    log_file.delete();
	}
	File parent_log_file = log_file.getParentFile();
	parent_log_file.mkdirs();
	parent_log_file = null;
	log_file = null;

	this.debug = debug;
	this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
	this.no_parents = no_parents;
	this.other_hosts = other_hosts;
	this.page_requisites = page_requisites;
	this.quiet = quiet;
	this.initial = new GURL(initial);
	this.depth = depth;
	this.destination = destination;
	this.proxy_pass = proxy_pass;
	this.proxy_user = proxy_user;
	this.mummy = mummy;

	progress = new DownloadProgressBar(this, initial.toString(), simple);

	encountered_urls = new Vector();
	failed_urls = new Vector();

	previous_state = STOPPED;
	state = STOPPED;
    }

    /** Depending on which button on the progress bar was pushed,
     * this method will affect the state of the DownloadJob and perhaps make
     * calls to wget.class if necessary.
     * @param event The ActionEvent fired from within the DownloadProgressBar
     * which we must respond to.
     */
    public void actionPerformed(ActionEvent event) {
	// The stop_start_button is used to alternately start or stop the
	// job. If the current state of the job is paused then this
	// restart is logically equivelent to a resume.
	if(event.getSource() == progress.stop_start_button) {
	    previous_state = state;
	    if (state == RUNNING) {
		state = STOPPED;
	    } else {
		//previous_state = state;
		state = RUNNING;
		mummy.resumeThread();
	    }
	}
	else if (event.getSource() == progress.close_button) {
	    if(state == RUNNING) {
		previous_state = state;
		state = STOPPED; // do we need to do anything else to stop this?
	    }
	    // else {
	    mummy.deleteDownloadJob(this);
	    // }
	}
    }

    /** Called by the WGet native code to inform us of a new download starting.
     * @param raw_url The url that is being downloaded, as a String.
     */
    public void addDownload(String raw_url) {
	if(!encountered_urls.contains(raw_url)) {
	    encountered_urls.add(raw_url);
	}
	// Regardless create a new GURL
	current_url = raw_url;
	url = new GURL(raw_url);
	progress.addDownload(raw_url);
    }

    /** Used to advise the DownloadJob of a newly parsed link. Its up to DownloadJob
     * to decide if it already knows about this url, and if not to
     * update its progress bar.
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link.
     * @return A boolean indicating if the url was added.
     */
    public boolean addLink(String raw_url, int type) {
	///ystem.out.println("addLink("+url+", "+type+")");
	if(!encountered_urls.contains(raw_url)) {
	    // Add it to the urls we've seen.
	    encountered_urls.add(raw_url);
	    // Add it the to links for the current GURL.
	    // Add it to the progress file count.
	    progress.increaseFileCount();
	    return true;
	}
	// Regardless add it to the children links of the current GURL
	// NOTE(review): this line is only reached for already-seen urls,
	// despite the "Regardless" comment — confirm whether new urls should
	// also be added as children before "fixing" the early return above.
	initial.addLink(raw_url);

	// We've seen it before. Don't count it again.
	return false;
    }

    /** Build the wget command line from this job's options (recursion depth,
     * parents/hosts/requisites flags, proxy configuration), spawn the external
     * wget process, and parse its stderr line-by-line to drive progress
     * updates until the process exits or the user stops the job.
     */
    public void callWGet() {
	// Build parameter string. Note that we never clobber, and we continue if possible
	// want to always download newer files, convert non-relative links to relative, always use directories, and only try twice to get a file before giving up
	String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";

	if (no_parents) {
	    command = command + "-np ";
	}
	if(depth < 0) {
	    // Infinite recursion
	    command = command + "-r ";
	}
	else if (depth == 0) {
	    // Just this page.
	}
	else if (depth > 0) {
	    // Recursion to the specified depth.
	    command = command + "-r -l" + depth + " ";
	}

	String proxy_url = "";
	// Determine if we have to use a proxy.
	if(Configuration.get("general.use_proxy", true)) {
	    String proxy_host = Configuration.getString("general.proxy_host", true);
	    String proxy_port = Configuration.getString("general.proxy_port", true);
	    // Find out whether the user has already authenticated themselves
	    String user_pass = null;
	    String address = proxy_host + ":" + proxy_port;
	    // Give the user up to three chances to authenticate.
	    // NOTE(review): credentials entered on the final prompt are never
	    // re-read because the && short-circuits once count reaches 3 —
	    // confirm whether that is intentional.
	    int count = 0;
	    while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
		Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
		count++;
	    }
	    if(count >= 3) {
		state = STOPPED;
		return;
	    }
	    if(user_pass.indexOf("@") != -1) {
		// Write the use proxy command - we don't do this anymore, instead we set environment variables - hopefully these can't be spied on like the follwoing can (using ps) - actually the environment stuff didn't work for windows, so lets go back to this
		if (Utility.isWindows()) {
		    command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
		}
		else {
		    String user_name = user_pass.substring(0, user_pass.indexOf("@"));
		    String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
		    proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
		}
	    }
	    else {
		DebugStream.println("Unknown user/pass");
	    }
	}

	// The user can choose to mirror all of the page requisites...
	if(page_requisites) {
	    command = command + "-p ";
	}

	// Download files from other hosts
	if(other_hosts) {
	    command = command + "-H ";
	}

	// Finally tell it the site to download.
	command = command + initial.toString();

	if(previous_state == DownloadJob.COMPLETE) {
	    progress.mirrorBegun(true, true);
	}
	else {
	    progress.mirrorBegun(false, true);
	}

	// Make sure the destination directory exists before wget runs in it.
	File dest_file = new File(destination);
	if (!dest_file.exists()) {
	    dest_file.mkdirs();
	}

	// Run it
	try {
	    //DebugStream.println("Cmd: " + command); // don't print it out cos it may have the password in it
	    Runtime rt = Runtime.getRuntime();
	    // Pass the proxy (with credentials) via the environment on
	    // non-windows platforms, so it doesn't show up in ps output.
	    String [] env = null;
	    if (!proxy_url.equals("")) {
		env = new String[2];
		env[0] = "http_proxy=http://"+proxy_url;
		env[1] = "ftp_proxy=ftp://"+proxy_url;
	    }
	    Process prcs = rt.exec(command, env, dest_file);
	    BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
	    try {
		// Capture the standard error stream and seach for two particular occurances.
		// NOTE(review): wget's progress messages appear on stderr;
		// stdout is never drained here — confirm wget produces no
		// stdout with these flags, or the process could block.
		String line;
		boolean ignore_for_robots = false;
		while ((line = br.readLine()) != null && state != STOPPED) {
		    DebugStream.println(line);
		    download_log.appendLine(line);
		    // The first magic special test is to see if we've just
		    // asked for the robots.txt file. If so we ignore
		    // the next add and then the next complete/error.
		    if(line.lastIndexOf("robots.txt;") != -1) {
			DebugStream.println("***** Requesting robot.txt");
			ignore_for_robots = true;
		    }
		    // If line contains "=> `" display text as the
		    // currently downloading url. Unique to add download.
		    else if(line.lastIndexOf("=> `") != -1) {
			if(!ignore_for_robots) {
			    // Add download
			    String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
			    addDownload("http:/" + new_url);
			}
		    }
		    // If line contains "/s) - `" set currently
		    // downloading url to "Download Complete".
		    else if(line.lastIndexOf("/s) - `") != -1) {
			String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
			if(!ignore_for_robots) {
			    DebugStream.println("Not ignore for robots");
			    // Download complete
			    downloadComplete(current_file_downloading);
			}
			else {
			    DebugStream.println("Ignore for robots");
			    ignore_for_robots = false;
			}
		    }
		    // The already there line begins "File `..." However this
		    // is only true in english, so instead I looked and there
		    // are few (if any at all) other messages than those above
		    // and not overwriting messages that use " `" so we'll
		    // look for that. Note this method is not guarenteed to be
		    // unique like the previous two.
		    else if(line.lastIndexOf(" `") != -1) {
			// Not Overwriting
			DebugStream.println("Already there.");
			String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
			addDownload("http:/" + new_url);
			downloadWarning();
		    }
		    // Any other important message starts with the time in the form hh:mm:ss
		    else if(line.length() > 7) {
			if(line.charAt(2) == ':' && line.charAt(5) == ':') {
			    if(!ignore_for_robots) {
				DebugStream.println("Error.");
				downloadFailed();
			    }
			    else {
				ignore_for_robots = false;
			    }
			}
		    }
		}
		if(state == STOPPED) {
		    prcs.destroy(); // This doesn't always work, but it's worth a try
		}
		else {
		    // Now display final message based on exit value
		    prcs.waitFor();
		}
	    }
	    finally {
		// Fixed leak: previously the reader was only closed on the
		// STOPPED path. Closing br also closes the underlying stream.
		br.close();
	    }
	}
	catch (Exception ioe) {
	    //message(Utility.ERROR, ioe.toString());
	    DebugStream.printStackTrace(ioe);
	}
	// If we've got to here and the state isn't STOPPED then the
	// job is complete.
	if(state == DownloadJob.RUNNING) {
	    progress.mirrorComplete();
	    previous_state = state;
	    state = DownloadJob.COMPLETE;
	}
	// refresh the workspace tree
	Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
    }

    /** The most important part of the DownloadJob class, this method is
     * responsible for calling the WGet native methods used to
     * mirror the indicated url. By this stage all the variables
     * necessary should be set and we need only build up the
     * parameter string and make the call.
     */
    public void callWGetNative() {
	Vector args = new Vector();

	// Let the DownloadProgressBar know we're starting, just in case
	// the user hasn't told us to. If this is the second time the
	// urls downloaded and the first attempt was successful (ie
	// the previous job was complete), then we have the case where
	// the user is forcing us to remirror. Reset all the values etc
	// if this is the case then reset the variables.
	// Note that this can cause the result line to look something
	// like this.
	// Downloaded 12 of 12 files (8 warnings, 0 errors).
	// The warnings would be something like, 'File already downloaded'
	// but the total number of files and the file successfully
	// downloaded will be correct.
	if(previous_state == DownloadJob.COMPLETE) {
	    progress.mirrorBegun(true, false);
	}
	else {
	    progress.mirrorBegun(false, false);
	}

	// Parse arguments into array.
	args.add(Utility.BASE_DIR + "wget");
	args.add("-d");
	args.add("-o");
	args.add("debug.txt");
	if(destination != null) {
	    args.add("-P");
	    args.add(destination);
	}
	if(depth < 0) {
	    // Infinite recursion
	    args.add("-r");
	}
	else if (depth == 0) {
	    // Just this page.
	}
	else if (depth > 0) {
	    // Recursion to the specified depth.
	    args.add("-r");
	    args.add("-l");
	    args.add("" + depth + ""); // Hacky
	}
	if(previous_state == PAUSED) {
	    args.add("-nc");
	    args.add("-c");
	}
	if(proxy_user != null) {
	    args.add("--proxy-user=" + proxy_user);
	    args.add("--proxy-passwd=" + proxy_pass);
	}
	if(page_requisites) {
	    args.add("-p");
	}
	if(quiet) {
	    args.add("-q");
	}
	if(other_hosts) {
	    args.add("-H");
	}
	args.add(initial.toString());

	DebugStream.println("Calling wget ");
	for(Enumeration e = args.elements(); e.hasMoreElements();) {
	    DebugStream.println(e.nextElement() + " ");
	}
	DebugStream.println("");

	// Run home to mummy.
	int value = mummy.wget(args.size(), args.toArray(), debug);

	// If we've got to here and the state isn't STOPPED then the job is complete.
	if(state == RUNNING) {
	    progress.mirrorComplete();
	    previous_state = state;
	    state = COMPLETE;
	}
    }

    /** Called by the WGet native code when the current download is
     * completed. In turn all download listeners are informed.
     */
    public void downloadComplete() {
	progress.downloadComplete();
	url = null;
	current_url = null;
    }

    /** Marks the named file's download as complete on the progress bar.
     * @param current_file_downloading the local path of the completed file.
     */
    public void downloadComplete(String current_file_downloading) {
	progress.downloadComplete();
	DebugStream.println("Current File: " + current_file_downloading);
	// !! TEMPORARILY DISABLED !!
	// The workspace-tree cache-mapping refresh that used to live here
	// (WorkspaceTreeModel.refreshWebCacheMappings() and the manual
	// FileNode insertion into the workspace tree) has been disabled;
	// see version history for the removed implementation.
	//WorkspaceTreeModel.refreshWebCacheMappings();
    }

    /** Called by the WGet native code when the requested download returns
     * a status code other than 200.
     */
    public void downloadFailed() {
	///ystem.out.println("downloadFailed("+current_url+")");
	failed_urls.add(current_url); // Its the current url thats failed.
	progress.downloadFailed();
    }

    /** Records a warning (e.g. file already downloaded) on the progress bar. */
    public void downloadWarning() {
	progress.downloadWarning();
    }

    /**
     * @return A String representing the initial urls host (root node
     * of tree that we are mirroring).
     */
    public String getHost() {
	return url.getHost();
    }

    /** @return the log document that collects this job's wget output. */
    public AppendLineOnlyFileDocument getLogDocument() {
	return download_log;
    }

    /**
     * @return Returns the progress bar associated with this job.
     */
    public DownloadProgressBar getProgressBar() {
	return progress;
    }

    /** Called to discover if the user wanted this thread to run or if
     * it is paused.
     * @return An int representing the current DownloadJob state.
     */
    public int getState() {
	return state;
    }

    /** Returns the current state of the stop flag for this job.
     * @return A boolean representing whether the user has requested to
     * stop.
     */
    public boolean hasSignalledStop() {
	return state == DownloadJob.STOPPED || state == DownloadJob.PAUSED || state == DownloadJob.COMPLETE;
    }

    /** Sets the job state, remembering the old state as previous_state.
     * @param state the new state; one of COMPLETE, PAUSED, RUNNING, STOPPED.
     */
    public void setState(int state) {
	previous_state = this.state;
	this.state = state;
    }

    /** A convinence call.
     * @return A String representing the url of the initial url (root node of the mirrored tree).
     */
    public String toString() {
	return initial.toString();
    }

    /** Called by the WGet native code to signal the current progress of
     * downloading.
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
	progress.updateProgress(current, expected);
    }
}