/**
*#########################################################################
*
* A component of the Gatherer application, part of the Greenstone digital
* library suite from the New Zealand Digital Library Project at the
* University of Waikato, New Zealand.
*
*
*
* Author: John Thompson, Greenstone Digital Library, University of Waikato
*
*
*
* Copyright (C) 1999 New Zealand Digital Library Project
*
*
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*########################################################################
*/
package org.greenstone.gatherer.collection;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.tree.*;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.WGet;
import org.greenstone.gatherer.gui.GProgressBar;
import org.greenstone.gatherer.util.GURL;
import org.greenstone.gatherer.util.Utility;
/**
* @author John Thompson, Greenstone Digital Library, University of Waikato
* @version 2.0
*/
public class Job
implements ActionListener {
private boolean clobber;
private boolean debug;
private boolean higher_directories;
private boolean no_parents;
private boolean other_hosts;
private boolean page_requisites;
private boolean quiet;
private GProgressBar progress;
private GURL initial = null;
private GURL url = null;
private TreeModel model;
private int depth;
private int previous_state;
private int state;
private String current_url;
private String destination;
private String proxy_pass;
private String proxy_user;
private Vector encountered_urls;
private Vector failed_urls;
private WGet mummy;
public static int COMPLETE = 0;
public static int PAUSED = 1;
public static int RUNNING = 2;
public static int STOPPED = 3;
/**
*/
public Job(TreeModel model, boolean clobber, boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
this.model = model;
this.debug = debug;
this.clobber = clobber;
this.no_parents = no_parents;
this.other_hosts = other_hosts;
this.page_requisites = page_requisites;
this.quiet = quiet;
this.initial = new GURL(initial);
this.depth = depth;
this.destination = destination;
this.proxy_pass = proxy_pass;
this.proxy_user = proxy_user;
this.mummy = mummy;
progress = new GProgressBar(this, initial.toString(), simple);
encountered_urls = new Vector();
failed_urls = new Vector();
previous_state = STOPPED;
state = STOPPED;
}
/** Depending on which button on the progress bar was pushed,
* this method will affect the state of the Job and perhaps make
* calls to wget.class if necessary.
* @param event The ActionEvent fired from within the GProgressBar
* which we must respond to.
*/
public void actionPerformed(ActionEvent event) {
// The action button is used to alternately start or stop the
// job. If the current state of the job is paused then this
// restart is logically equivelent to a resume.
if(event.getSource() == progress.action) {
previous_state = state;
if(state == RUNNING) {
state = PAUSED;
}
else {
state = RUNNING;
mummy.resumeThread();
}
}
else if (event.getSource() == progress.cancel) {
state = STOPPED; // Should already be stopped.
mummy.deleteJob(this);
}
}
/** Called by the WGet native code to inform us of a new download starting.
* @param url The url that is being downloaded, as a String.
*/
public void addDownload(String raw_url) {
if(!encountered_urls.contains(raw_url)) {
encountered_urls.add(raw_url);
}
// Regardless create a new GURL
current_url = raw_url;
url = new GURL(raw_url);
progress.addDownload(raw_url);
}
/** Used to advise the Job of a newly parsed link. Its up to Job
* to decide if it already knows about this url, and if not to
* update its progress bar.
* @param url The url in question as a String.
* @param type Whether the link is an internal or external link.
* @return A boolean indicating if the url was added.
*/
public boolean addLink(String raw_url, int type) {
///ystem.out.println("addLink("+url+", "+type+")");
if(!encountered_urls.contains(raw_url)) {
// Add it to the urls we've seen.
encountered_urls.add(raw_url);
// Add it the to links for the current GURL.
// Add it to the progress file count.
progress.increaseFileCount();
return true;
}
// Regardless add it to the children links of the current GURL
initial.addLink(raw_url);
// We've seen it before. Don't count it again.
return false;
}
public void callWGet() {
// Build parameter string
String command = "wget ";
// Parse arguments into array.
// Always:
// rewrite links to be local if possible - NOOOOOO,
// output a debug file and debug messages,
// run quietly.
//command = command + "-k ";
if(destination != null) {
command = command + "-P " + destination + " ";
}
if(depth < 0) {
// Infinite recursion
command = command + "-r ";
}
else if (depth == 0) {
// Just this page.
}
else if (depth > 0) {
// Recursion to the specified depth.
command = command + "-r -l" + depth + " ";
}
if(!clobber || previous_state == Job.PAUSED) {
command = command + "-nc -c ";
}
// Determine if we have to use a proxy.
if(Gatherer.config.get("general.use_proxy", true)) {
String proxy_host = Gatherer.config.getString("general.proxy_host", true);
String proxy_port = Gatherer.config.getString("general.proxy_port", true);
// Find out whether the user has already authenticated themselves
String user_pass = null;
String address = proxy_host + ":" + proxy_port;
int count = 0;
while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
count++;
}
if(count >= 3) {
state = STOPPED;
return;
}
if(user_pass.indexOf("@") != -1) {
// Write the use proxy command
command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
}
else {
Gatherer.println("Unknown user/pass");
}
}
if(page_requisites) {
command = command + "-p ";
}
if(other_hosts) {
command = command + "-H ";
}
// Finally tell it the site to download.
command = command + initial.toString();
if(previous_state == Job.COMPLETE) {
progress.mirrorBegun(true, true);
}
else {
progress.mirrorBegun(false, true);
}
// Run it
try {
Gatherer.println("Cmd: " + command);
Runtime rt = Runtime.getRuntime();
Process prcs = rt.exec(command);
InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
BufferedReader br = new BufferedReader(isr);
// Capture the standard error stream and seach for two particular occurances.
String line;
boolean ignore_for_robots = false;
while ((line = br.readLine()) != null) {
Gatherer.println(line);
// The first magic special test is to see if we've just
// asked for the robots.txt file. If so we ignore
// the next add and then the next complete/error.
if(line.lastIndexOf("robots.txt;") != -1) {
Gatherer.println("***** Requesting robot.txt");
ignore_for_robots = true;
}
// If line contains "=> `" display text as the
// currently downloading url. Unique to add download.
else if(line.lastIndexOf("=> `") != -1) {
if(!ignore_for_robots) {
// Add download
String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
// Remove the destination guff
if(destination != null) {
new_url = new_url.substring(destination.length());
}
addDownload("http:/" + new_url);
}
}
// If line contains "saved []" set currently
// downloading url to "Download Complete".
else if(line.lastIndexOf(") - `") != -1) {
if(!ignore_for_robots) {
// Download complete
downloadComplete();
}
else {
ignore_for_robots = false;
}
}
// The already there line begins "File `..." However this
// is only true in english, so instead I looked and there
// are few (if any at all) other messages than those above
// and not overwriting messages that use " `" so we'll
// look for that. Note this method is not guarenteed to be
// unique like the previous two.
else if(line.lastIndexOf(" `") != -1) {
// Not Overwriting
Gatherer.println("Already there.");
String new_url =
line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
// For some strange reason this won't compile
// src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
// symbol : class CAKE
// location: class org.greenstone.gatherer.collection.Job
/* ***********************************************************
CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
*********************************************************** */
// Remove the destination guff
if(destination != null) {
new_url = new_url.substring(destination.length());
}
addDownload("http:/" + new_url);
downloadWarning();
}
// Any other important message starts with the time in the form hh:mm:ss
else if(line.length() > 7) {
if(line.charAt(2) == ':' && line.charAt(5) == ':') {
if(!ignore_for_robots) {
Gatherer.println("Error.");
downloadFailed();
}
else {
ignore_for_robots = false;
}
}
}
}
// Now display final message based on exit value
prcs.waitFor();
}
catch (Exception ioe) {
//message(Utility.ERROR, ioe.toString());
Gatherer.printStackTrace(ioe);
}
// If we've got to here and the state isn't STOPPED then the
// job is complete.
if(state == Job.RUNNING) {
progress.mirrorComplete();
previous_state = state;
state = Job.COMPLETE;
}
}
/** The most important part of the Job class, this method is
* responsible for calling the WGet native methods used to
* mirror the indicated url. By this stage all the variables
* necessary should be set and we need only build up the
* parameter string and make the call.
*/
public void callWGetNative() {
Vector args = new Vector();
// Let the GProgressBar know we're starting, just in case
// the user hasn't told us to. If this is the second time the
// urls downloaded and the first attempt was successful (ie
// the previous job was complete), then we have the case where
// the user is forcing us to remirror. Reset all the values etc
// if this is the case then reset the variables.
// Note that this can cause the result line to look something
// like this.
// Downloaded 12 of 12 files (8 warnings, 0 errors).
// The warnings would be something like, 'File already downloaded'
// but the total number of files and the file successfully
// downloaded will be correct.
if(previous_state == Job.COMPLETE) {
progress.mirrorBegun(true, false);
}
else {
progress.mirrorBegun(false, false);
}
// Parse arguments into array.
args.add(Utility.BASE_DIR + "wget");
//args.add("-k");
args.add("-d");
args.add("-o");
args.add("debug.txt");
if(destination != null) {
args.add("-P");
args.add(destination);
}
if(depth < 0) {
// Infinite recursion
args.add("-r");
}
else if (depth == 0) {
// Just this page.
}
else if (depth > 0) {
// Recursion to the specified depth.
args.add("-r");
args.add("-l");
args.add("" + depth + ""); // Hacky
}
if(!clobber || previous_state == PAUSED) {
args.add("-nc");
args.add("-c");
}
if(proxy_user != null) {
args.add("--proxy-user=" + proxy_user);
args.add("--proxy-passwd=" + proxy_pass);
}
if(page_requisites) {
args.add("-p");
}
if(quiet) {
args.add("-q");
}
if(other_hosts) {
args.add("-H");
}
args.add(initial.toString());
Gatherer.println("Calling wget ");
for(Enumeration e = args.elements(); e.hasMoreElements();) {
Gatherer.println(e.nextElement() + " ");
}
Gatherer.println("");
// Run home to mummy.
int value = mummy.wget(args.size(), args.toArray(), debug);
// If we've got to here and the state isn't STOPPED then the job is complete.
if(state == RUNNING) {
progress.mirrorComplete();
previous_state = state;
state = COMPLETE;
}
}
/** Called by the WGet native code when the current download is
* completed. In turn all download listeners are informed.
*/
public void downloadComplete() {
progress.downloadComplete();
/* @todo
model.add(url.getURL(), destination);
*/
url = null;
current_url = null;
}
/** Called by the WGet native code when the requested download returns
* a status code other than 200.
*/
public void downloadFailed() {
///ystem.out.println("downloadFailed("+current_url+")");
failed_urls.add(current_url); // Its the current url thats failed.
progress.downloadFailed();
}
/**
*/
public void downloadWarning() {
progress.downloadWarning();
}
/**
* @return A String representing the currently downloading url.
*/
public String getCurrent() {
return current_url;
}
/**
* @return A String representing the initial urls host (root node
* of tree that we are mirroring).
*/
public String getHost() {
return url.getHost();
}
/**
* @return Returns the progress bar associated with this job.
*/
public GProgressBar getProgressBar() {
return progress;
}
/** Called to discover if the user wanted this thread to run or if
* it is paused.
* @return An int representing the current Job state.
*/
public int getState() {
return state;
}
/** Returns the current state of the stop flag for this job.
* @return A boolean representing whether the user has requested to
* stop.
*/
public boolean hasSignalledStop() {
if(state == Job.STOPPED || state == Job.PAUSED ||
state == Job.COMPLETE) {
return true;
}
return false;
}
/** A convinence call.
* @return A String representing the url of the initial url (root node of the mirrored tree).
*/
public String toString() {
return initial.toString();
}
/** Called by the WGet native code to signal the current progress of
* downloading.
* @param current A long representing the number of bytes that have
* been downloaded since the last update.
* @param expected A long representing the total number of bytes
* expected for this download.
*/
public void updateProgress(long current, long expected) {
progress.updateProgress(current, expected);
}
}