/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
*
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.collection;

import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.tree.*;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.WGet;
import org.greenstone.gatherer.gui.GProgressBar;
import org.greenstone.gatherer.util.GURL;
import org.greenstone.gatherer.util.Utility;

/**
 * A single mirroring job. A Job holds the settings for one wget run,
 * builds the wget argument list from them, runs wget (either as an
 * external process or through the native call on {@link WGet}), and
 * relays what happens to its {@link GProgressBar}.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.0
 */
public class Job
    implements ActionListener {

    /** How many times the user is prompted for proxy credentials before the job gives up. */
    private static final int MAX_AUTH_ATTEMPTS = 3;
    /** Prefix of the wget argument that carries the proxy password. */
    private static final String PROXY_PASSWORD_ARGUMENT = "--proxy-passwd=";

    private boolean clobber;
    private boolean debug;
    private boolean higher_directories;
    private boolean no_parents;
    private boolean other_hosts;
    private boolean page_requisites;
    private boolean quiet;
    private GProgressBar progress;
    private GURL initial = null;
    private GURL url = null;
    private TreeModel model;
    private int depth;
    private int previous_state;
    private int state;
    private String current_url;
    private String destination;
    private String proxy_pass;
    private String proxy_user;
    private Vector encountered_urls;
    private Vector failed_urls;
    private WGet mummy;

    /** State: the mirror ran to completion. */
    public static final int COMPLETE = 0;
    /** State: the user paused a running job. */
    public static final int PAUSED = 1;
    /** State: the job is (or should be) downloading. */
    public static final int RUNNING = 2;
    /** State: the job is not running. This is the initial state. */
    public static final int STOPPED = 3;

    /**
     * Creates a job in the STOPPED state together with its progress bar.
     *
     * @param model The tree model associated with this job (stored only).
     * @param clobber When false, wget is told not to overwrite (-nc -c).
     * @param debug Passed through to the native wget call.
     * @param no_parents Stored only; not used when building arguments here.
     * @param other_hosts When true, wget is given -H.
     * @param page_requisites When true, wget is given -p.
     * @param quiet When true, the native wget call is given -q.
     * @param initial The url to mirror. Must not be null.
     * @param depth Recursion depth: negative for unlimited, 0 for the
     * single page, positive for that many levels.
     * @param destination Directory prefix handed to wget with -P, or null.
     * @param proxy_pass Proxy password used by the native call.
     * @param proxy_user Proxy user used by the native call, or null for none.
     * @param mummy The WGet instance that owns this job.
     * @param simple Passed through to the GProgressBar constructor.
     */
    public Job(TreeModel model, boolean clobber, boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
        this.model = model;
        this.debug = debug;
        this.clobber = clobber;
        this.no_parents = no_parents;
        this.other_hosts = other_hosts;
        this.page_requisites = page_requisites;
        this.quiet = quiet;
        this.initial = new GURL(initial);
        this.depth = depth;
        this.destination = destination;
        this.proxy_pass = proxy_pass;
        this.proxy_user = proxy_user;
        this.mummy = mummy;

        // Note: 'initial' here is the URL parameter, not the GURL field.
        progress = new GProgressBar(this, initial.toString(), simple);

        encountered_urls = new Vector();
        failed_urls = new Vector();

        previous_state = STOPPED;
        state = STOPPED;
    }

    /**
     * Responds to the buttons on the progress bar. The action button
     * toggles between running and paused (starting a paused or stopped
     * job is logically a resume); the cancel button stops the job and
     * asks the owning WGet to delete it.
     *
     * @param event The ActionEvent fired from within the GProgressBar
     * which we must respond to.
     */
    public void actionPerformed(ActionEvent event) {
        Object source = event.getSource();
        if (source == progress.action) {
            previous_state = state;
            if (state == RUNNING) {
                state = PAUSED;
            }
            else {
                state = RUNNING;
                mummy.resumeThread();
            }
        }
        else if (source == progress.cancel) {
            state = STOPPED; // Should already be stopped.
            mummy.deleteJob(this);
        }
    }

    /**
     * Called to inform us of a new download starting. Records the url
     * as encountered (once), makes it the current url and tells the
     * progress bar.
     *
     * @param raw_url The url that is being downloaded, as a String.
     */
    public void addDownload(String raw_url) {
        if (!encountered_urls.contains(raw_url)) {
            encountered_urls.add(raw_url);
        }
        // Regardless of whether it was new, it becomes the current url.
        current_url = raw_url;
        url = new GURL(raw_url);
        progress.addDownload(raw_url);
    }

    /**
     * Used to advise the Job of a newly parsed link. A url not seen
     * before is recorded and added to the progress bar's file count.
     *
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link
     * (currently unused).
     * @return true if the url was new and has been counted, false if it
     * had been seen before.
     */
    public boolean addLink(String raw_url, int type) {
        if (!encountered_urls.contains(raw_url)) {
            // Add it to the urls we've seen and to the progress file count.
            encountered_urls.add(raw_url);
            progress.increaseFileCount();
            return true;
        }
        // NOTE(review): the original comment here read "Regardless add it
        // to the children links", yet this line is only reached for urls
        // seen before. Behaviour kept as is; confirm the intent against
        // GURL.addLink before moving it.
        initial.addLink(raw_url);
        // We've seen it before. Don't count it again.
        return false;
    }

    /**
     * Mirrors the initial url by running the external wget program and
     * interpreting the messages it writes to its error stream. The
     * arguments are handed to the process as an array, so values that
     * contain whitespace (such as the destination) stay intact.
     * If a proxy is configured and no credentials can be obtained after
     * prompting, the job is set to STOPPED and wget is not run. When
     * wget finishes and the job is still RUNNING it becomes COMPLETE.
     */
    public void callWGet() {
        Vector args = new Vector();
        args.add("wget");

        if (destination != null) {
            args.add("-P");
            args.add(destination);
        }

        if (depth < 0) {
            // Infinite recursion.
            args.add("-r");
        }
        else if (depth > 0) {
            // Recursion to the specified depth. A depth of 0 means just this page.
            args.add("-r");
            args.add("-l" + depth);
        }

        if (!clobber || previous_state == PAUSED) {
            args.add("-nc");
            args.add("-c");
        }

        // Determine if we have to use a proxy.
        if (Gatherer.config.get("general.use_proxy", true)) {
            String proxy_host = Gatherer.config.getString("general.proxy_host", true);
            String proxy_port = Gatherer.config.getString("general.proxy_port", true);
            String address = proxy_host + ":" + proxy_port;

            // Find out whether the user has already authenticated; if not,
            // prompt, and look again after every prompt so that the final
            // attempt is honoured too.
            String user_pass = (String) Gatherer.authentications.get(address);
            for (int attempts = 0; user_pass == null && attempts < MAX_AUTH_ATTEMPTS; attempts++) {
                Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
                user_pass = (String) Gatherer.authentications.get(address);
            }
            if (user_pass == null) {
                state = STOPPED;
                return;
            }

            // The stored credentials have the form user@password.
            int separator = user_pass.indexOf("@");
            if (separator != -1) {
                args.add("-e");
                args.add("httpproxy=" + proxy_host + ":" + proxy_port + "/");
                args.add("--proxy-user=" + user_pass.substring(0, separator));
                args.add(PROXY_PASSWORD_ARGUMENT + user_pass.substring(separator + 1));
                args.add("-Y");
                args.add("on");
            }
            else {
                Gatherer.println("Unknown user/pass");
            }
        }

        if (page_requisites) {
            args.add("-p");
        }
        if (other_hosts) {
            args.add("-H");
        }

        // Finally tell it the site to download.
        args.add(initial.toString());

        // A previous state of COMPLETE means the user is forcing a remirror.
        progress.mirrorBegun(previous_state == COMPLETE, true);

        String[] command = new String[args.size()];
        args.toArray(command);

        // Run it.
        BufferedReader br = null;
        try {
            Gatherer.println("Cmd: " + describeCommand(command));
            Process prcs = Runtime.getRuntime().exec(command);
            // Only the error stream is read; that is where the messages
            // parsed below arrive.
            br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
            String line;
            // Set while the robots.txt request is in flight: its start and
            // its completion/error must not be counted as a download.
            boolean ignore_for_robots = false;
            while ((line = br.readLine()) != null) {
                Gatherer.println(line);
                if (line.lastIndexOf("robots.txt;") != -1) {
                    Gatherer.println("***** Requesting robot.txt");
                    ignore_for_robots = true;
                }
                // "=> `" is unique to the start of a download.
                else if (line.lastIndexOf("=> `") != -1) {
                    if (!ignore_for_robots) {
                        String new_url = extractDownloadUrl(line);
                        if (new_url != null) {
                            addDownload(new_url);
                        }
                    }
                }
                // ") - `" marks a finished download.
                else if (line.lastIndexOf(") - `") != -1) {
                    if (!ignore_for_robots) {
                        downloadComplete();
                    }
                    else {
                        ignore_for_robots = false;
                    }
                }
                // The "already there" line begins "File `..." but only in
                // English, so any remaining line containing " `" is treated
                // as a not-overwriting message. Unlike the two tests above
                // this is not guaranteed to be unique, hence lines whose
                // quoted name cannot be extracted are skipped.
                else if (line.lastIndexOf(" `") != -1) {
                    Gatherer.println("Already there.");
                    String new_url = extractDownloadUrl(line);
                    if (new_url != null) {
                        addDownload(new_url);
                        downloadWarning();
                    }
                }
                // Any other important message starts with the time in the form hh:mm:ss
                else if (line.length() > 7) {
                    if (line.charAt(2) == ':' && line.charAt(5) == ':') {
                        if (!ignore_for_robots) {
                            Gatherer.println("Error.");
                            downloadFailed();
                        }
                        else {
                            ignore_for_robots = false;
                        }
                    }
                }
            }
            // Wait for wget to exit before declaring the job finished.
            prcs.waitFor();
        }
        catch (Exception ioe) {
            Gatherer.printStackTrace(ioe);
        }
        finally {
            if (br != null) {
                try {
                    br.close();
                }
                catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }

        // If we've got to here and the state is still RUNNING then the
        // job is complete.
        if (state == RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = COMPLETE;
        }
    }

    /**
     * Mirrors the initial url through the native wget call on the owning
     * WGet. By this stage all the variables necessary should be set, so
     * this only builds the argument list and makes the call.
     * If the previous state was COMPLETE the user is forcing a remirror
     * and the progress bar is told so. After a remirror the result line
     * can read, for example, "Downloaded 12 of 12 files (8 warnings, 0
     * errors)", where the warnings are along the lines of 'File already
     * downloaded'.
     */
    public void callWGetNative() {
        Vector args = new Vector();

        progress.mirrorBegun(previous_state == COMPLETE, false);

        // Parse arguments into array.
        args.add(Utility.BASE_DIR + "wget");
        args.add("-d");
        args.add("-o");
        args.add("debug.txt");

        if (destination != null) {
            args.add("-P");
            args.add(destination);
        }

        if (depth < 0) {
            // Infinite recursion.
            args.add("-r");
        }
        else if (depth > 0) {
            // Recursion to the specified depth. A depth of 0 means just this page.
            args.add("-r");
            args.add("-l");
            args.add(String.valueOf(depth));
        }

        if (!clobber || previous_state == PAUSED) {
            args.add("-nc");
            args.add("-c");
        }

        if (proxy_user != null) {
            args.add("--proxy-user=" + proxy_user);
            args.add(PROXY_PASSWORD_ARGUMENT + proxy_pass);
        }

        if (page_requisites) {
            args.add("-p");
        }
        if (quiet) {
            args.add("-q");
        }
        if (other_hosts) {
            args.add("-H");
        }

        args.add(initial.toString());

        Gatherer.println("Calling wget ");
        for (Enumeration e = args.elements(); e.hasMoreElements(); ) {
            Gatherer.println(maskSecret(String.valueOf(e.nextElement())) + " ");
        }
        Gatherer.println("");

        // Run home to mummy.
        mummy.wget(args.size(), args.toArray(), debug);

        // If we've got to here and the state is still RUNNING then the job is complete.
        if (state == RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = COMPLETE;
        }
    }

    /**
     * Called when the current download is completed. Informs the
     * progress bar and clears the current url.
     */
    public void downloadComplete() {
        progress.downloadComplete();
        /* @todo model.add(url.getURL(), destination); */
        url = null;
        current_url = null;
    }

    /**
     * Called when the current download fails. The current url is added
     * to the list of failed urls and the progress bar is informed.
     */
    public void downloadFailed() {
        failed_urls.add(current_url); // Its the current url thats failed.
        progress.downloadFailed();
    }

    /**
     * Passes a download warning on to the progress bar.
     */
    public void downloadWarning() {
        progress.downloadWarning();
    }

    /**
     * @return A String representing the currently downloading url, or
     * null when no download is in progress.
     */
    public String getCurrent() {
        return current_url;
    }

    /**
     * @return The host of the url currently being downloaded or, when no
     * download is in progress, the host of the initial url (root node
     * of the tree that we are mirroring).
     */
    public String getHost() {
        // 'url' is cleared by downloadComplete() and is unset before the
        // first download, so fall back to the initial url.
        GURL source = (url != null) ? url : initial;
        return source.getHost();
    }

    /**
     * @return Returns the progress bar associated with this job.
     */
    public GProgressBar getProgressBar() {
        return progress;
    }

    /**
     * Called to discover if the user wanted this thread to run or if
     * it is paused.
     *
     * @return An int representing the current Job state.
     */
    public int getState() {
        return state;
    }

    /**
     * Returns the current state of the stop flag for this job.
     *
     * @return true when the job is STOPPED, PAUSED or COMPLETE.
     */
    public boolean hasSignalledStop() {
        return state == STOPPED || state == PAUSED || state == COMPLETE;
    }

    /**
     * A convenience call.
     *
     * @return A String representing the url of the initial url (root node of the mirrored tree).
     */
    public String toString() {
        return initial.toString();
    }

    /**
     * Called to signal the current progress of downloading.
     *
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
        progress.updateProgress(current, expected);
    }

    /**
     * Turns the file name wget quotes between ` and ' into a url by
     * dropping as many leading characters as the destination is long and
     * prefixing "http:/".
     *
     * @param line A line of wget output that contains a back quote.
     * @return The url, or null if no quoted name can be extracted.
     */
    private String extractDownloadUrl(String line) {
        int open = line.indexOf("`");
        int close = line.lastIndexOf("'");
        if (open == -1 || close <= open) {
            return null;
        }
        String new_url = line.substring(open + 1, close);
        // Remove the destination guff.
        if (destination != null) {
            if (new_url.length() < destination.length()) {
                return null;
            }
            new_url = new_url.substring(destination.length());
        }
        // NOTE(review): this yields "http://host/..." only when the
        // remainder starts with a slash, i.e. a destination given without
        // a trailing separator - confirm against the callers.
        return "http:/" + new_url;
    }

    /** Joins the arguments with spaces for logging, hiding the proxy password. */
    private static String describeCommand(String[] command) {
        StringBuffer text = new StringBuffer();
        for (int i = 0; i < command.length; i++) {
            if (i > 0) {
                text.append(' ');
            }
            text.append(maskSecret(command[i]));
        }
        return text.toString();
    }

    /** Returns the argument unchanged unless it carries the proxy password, which is starred out. */
    private static String maskSecret(String argument) {
        if (argument.startsWith(PROXY_PASSWORD_ARGUMENT)) {
            return PROXY_PASSWORD_ARGUMENT + "********";
        }
        return argument;
    }
}