source: trunk/gli/src/org/greenstone/gatherer/collection/DownloadJob.java@ 8236

Last change on this file since 8236 was 8236, checked in by mdewsnip, 20 years ago

Replaced all Gatherer.print* with DebugStream.print*.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.8 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.WGet;
49import org.greenstone.gatherer.file.FileNode;
50import org.greenstone.gatherer.file.FileSystemModel;
51import org.greenstone.gatherer.file.WorkspaceTreeModel;
52import org.greenstone.gatherer.gui.DownloadProgressBar;
53import org.greenstone.gatherer.gui.tree.WorkspaceTree;
54import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
55import org.greenstone.gatherer.util.GURL;
56import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
57import org.greenstone.gatherer.util.Utility;
58/**
59 * @author John Thompson, Greenstone Digital Library, University of Waikato
60 * @version 2.0
61 */
62public class DownloadJob
63 implements ActionListener {
64
65 private boolean debug;
66 private boolean higher_directories;
67 private boolean no_parents;
68 private boolean other_hosts;
69 private boolean page_requisites;
70 private boolean quiet;
71
72 private AppendLineOnlyFileDocument download_log;
73
74 private DownloadProgressBar progress;
75
76 private GURL initial = null;
77 private GURL url = null;
78
79 // private TreeModel model;
80
81 private int depth;
82 private int previous_state;
83 private int state;
84
85 private String current_url;
86 private String destination;
87 private String proxy_pass;
88 private String proxy_user;
89
90 private Vector encountered_urls;
91 private Vector failed_urls;
92
93 private WGet mummy;
94
95 public static int COMPLETE = 0;
96 public static int PAUSED = 1;
97 public static int RUNNING = 2;
98 public static int STOPPED = 3;
99
100 /**
101 */
102 public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
103 // this.model = model;
104
105 String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
106 File log_file = new File(log_filename);
107 if(log_file.exists()) {
108 log_file.delete();
109 }
110 File parent_log_file = log_file.getParentFile();
111 parent_log_file.mkdirs();
112 parent_log_file = null;
113 log_file = null;
114
115 this.debug = debug;
116 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
117 this.no_parents = no_parents;
118 this.other_hosts = other_hosts;
119 this.page_requisites = page_requisites;
120 this.quiet = quiet;
121 this.initial = new GURL(initial);
122 this.depth = depth;
123 this.destination = destination;
124 this.proxy_pass = proxy_pass;
125 this.proxy_user = proxy_user;
126 this.mummy = mummy;
127
128 progress = new DownloadProgressBar(this, initial.toString(), simple);
129
130 encountered_urls = new Vector();
131 failed_urls = new Vector();
132
133 previous_state = STOPPED;
134 state = STOPPED;
135 }
136
137 /** Depending on which button on the progress bar was pushed,
138 * this method will affect the state of the DownloadJob and perhaps make
139 * calls to wget.class if necessary.
140 * @param event The ActionEvent fired from within the DownloadProgressBar
141 * which we must respond to.
142 */
143 public void actionPerformed(ActionEvent event) {
144 // The stop_start_button is used to alternately start or stop the
145 // job. If the current state of the job is paused then this
146 // restart is logically equivelent to a resume.
147 if(event.getSource() == progress.stop_start_button) {
148 previous_state = state;
149 if (state == RUNNING) {
150 state = STOPPED;
151 } else {
152 //previous_state = state;
153 state = RUNNING;
154 mummy.resumeThread();
155 }
156 }
157 else if (event.getSource() == progress.close_button) {
158 if(state == RUNNING) {
159 previous_state = state;
160 state = STOPPED; // do we need to do anything else to stop this?
161 }
162 // else {
163 mummy.deleteDownloadJob(this);
164 // }
165 }
166 }
167
168 /** Called by the WGet native code to inform us of a new download starting.
169 * @param raw_url The url that is being downloaded, as a String.
170 */
171 public void addDownload(String raw_url) {
172 if(!encountered_urls.contains(raw_url)) {
173 encountered_urls.add(raw_url);
174 }
175 // Regardless create a new GURL
176 current_url = raw_url;
177 url = new GURL(raw_url);
178 progress.addDownload(raw_url);
179 }
180
    /** Used to advise the DownloadJob of a newly parsed link. It is up to the
     * DownloadJob to decide if it already knows about this url, and if not to
     * update its progress bar's expected file count.
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link (currently unused).
     * @return A boolean indicating if the url was added.
     */
    public boolean addLink(String raw_url, int type) {
        ///ystem.out.println("addLink("+url+", "+type+")");
        if(!encountered_urls.contains(raw_url)) {
            // First sighting: remember it and count it towards the total
            // number of files the progress bar expects.
            encountered_urls.add(raw_url);
            // NOTE(review): because of this early return, a first-time url is
            // never passed to initial.addLink() below, despite the
            // "regardless" wording — only repeat sightings reach it. Confirm
            // whether that is intentional before changing.

            progress.increaseFileCount();
            return true;
        }
        // Seen before: record it as a child link of the initial url.
        initial.addLink(raw_url);

        // We've seen it before. Don't count it again.
        return false;
    }
205
206 public void callWGet() {
207 // Build parameter string. Note that we never clobber, and we continue if possible
208
209 // want to always download newer files, convert non-relative links to relative, always use directories, and only try twice to get a file before giving up
210 String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";
211
212 if (no_parents) {
213 command = command + "-np ";
214 }
215 if(depth < 0) {
216 // Infinite recursion
217 command = command + "-r ";
218 }
219 else if (depth == 0) {
220 // Just this page.
221 }
222 else if (depth > 0) {
223 // Recursion to the specified depth.
224 command = command + "-r -l" + depth + " ";
225 }
226
227 String proxy_url = "";
228 // Determine if we have to use a proxy.
229 if(Configuration.get("general.use_proxy", true)) {
230 String proxy_host = Configuration.getString("general.proxy_host", true);
231 String proxy_port = Configuration.getString("general.proxy_port", true);
232 // Find out whether the user has already authenticated themselves
233 String user_pass = null;
234 String address = proxy_host + ":" + proxy_port;
235 int count = 0;
236 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
237 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
238 count++;
239 }
240 if(count >= 3) {
241 state = STOPPED;
242 return;
243 }
244 if(user_pass.indexOf("@") != -1) {
245
246 // Write the use proxy command - we don't do this anymore, instead we set environment variables - hopefully these can't be spied on like the follwoing can (using ps) - actually the environment stuff didn't work for windows, so lets go back to this
247 if (Utility.isWindows()) {
248 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
249 } else {
250 String user_name = user_pass.substring(0, user_pass.indexOf("@"));
251 String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
252 proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
253 }
254
255 }
256 else {
257 DebugStream.println("Unknown user/pass");
258 }
259 }
260
261 // The user can choose to mirror all of the page requisites...
262 if(page_requisites) {
263 command = command + "-p ";
264 }
265
266 // Download files from other hosts
267 if(other_hosts) {
268 command = command + "-H ";
269 }
270
271 // Finally tell it the site to download.
272 command = command + initial.toString();
273
274 if(previous_state == DownloadJob.COMPLETE) {
275 progress.mirrorBegun(true, true);
276 }
277 else {
278 progress.mirrorBegun(false, true);
279 }
280
281 File dest_file = new File(destination);
282 if (!dest_file.exists()) {
283 dest_file.mkdirs();
284 }
285 // Run it
286 try {
287 //DebugStream.println("Cmd: " + command); // don't print it out cos it may have the password in it
288 Runtime rt = Runtime.getRuntime();
289 String [] env = null;
290 if (!proxy_url.equals("")) {
291 env = new String[2];
292 env[0] = "http_proxy=http://"+proxy_url;
293 env[1] = "ftp_proxy=ftp://"+proxy_url;
294 }
295 Process prcs = rt.exec(command, env, dest_file);
296 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
297 BufferedReader br = new BufferedReader(isr);
298 // Capture the standard error stream and seach for two particular occurances.
299 String line;
300 boolean ignore_for_robots = false;
301 while ((line = br.readLine()) != null && state != STOPPED) {
302
303 DebugStream.println(line);
304 download_log.appendLine(line);
305 // The first magic special test is to see if we've just
306 // asked for the robots.txt file. If so we ignore
307 // the next add and then the next complete/error.
308 if(line.lastIndexOf("robots.txt;") != -1) {
309 DebugStream.println("***** Requesting robot.txt");
310 ignore_for_robots = true;
311 }
312 // If line contains "=> `" display text as the
313 // currently downloading url. Unique to add download.
314 else if(line.lastIndexOf("=> `") != -1) {
315 if(!ignore_for_robots) {
316 // Add download
317 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
318 addDownload("http:/" + new_url);
319 }
320 }
321 // If line contains "/s) - `" set currently
322 // downloading url to "Download Complete".
323 else if(line.lastIndexOf("/s) - `") != -1) {
324 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
325 if(!ignore_for_robots) {
326 DebugStream.println("Not ignore for robots");
327 // Download complete
328 downloadComplete(current_file_downloading);
329 }
330 else {
331 DebugStream.println("Ignore for robots");
332 ignore_for_robots = false;
333 }
334 }
335 // The already there line begins "File `..." However this
336 // is only true in english, so instead I looked and there
337 // are few (if any at all) other messages than those above
338 // and not overwriting messages that use " `" so we'll
339 // look for that. Note this method is not guarenteed to be
340 // unique like the previous two.
341 else if(line.lastIndexOf(" `") != -1) {
342 // Not Overwriting
343 DebugStream.println("Already there.");
344 String new_url =
345 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
346 addDownload("http:/" + new_url);
347 downloadWarning();
348 }
349 // Any other important message starts with the time in the form hh:mm:ss
350 else if(line.length() > 7) {
351 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
352 if(!ignore_for_robots) {
353 DebugStream.println("Error.");
354 downloadFailed();
355 }
356 else {
357 ignore_for_robots = false;
358 }
359 }
360 }
361 }
362 if(state == STOPPED) {
363 isr.close();
364 prcs.destroy(); // This doesn't always work, but it's worth a try
365 }
366 else {
367 // Now display final message based on exit value
368 prcs.waitFor();
369 }
370 }
371 catch (Exception ioe) {
372 //message(Utility.ERROR, ioe.toString());
373 DebugStream.printStackTrace(ioe);
374 }
375 // If we've got to here and the state isn't STOPPED then the
376 // job is complete.
377 if(state == DownloadJob.RUNNING) {
378 progress.mirrorComplete();
379 previous_state = state;
380 state = DownloadJob.COMPLETE;
381
382 }
383 // refresh the workspace tree
384 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
385 }
386
387
388 /** The most important part of the DownloadJob class, this method is
389 * responsible for calling the WGet native methods used to
390 * mirror the indicated url. By this stage all the variables
391 * necessary should be set and we need only build up the
392 * parameter string and make the call.
393 */
394 public void callWGetNative() {
395 Vector args = new Vector();
396
397 // Let the DownloadProgressBar know we're starting, just in case
398 // the user hasn't told us to. If this is the second time the
399 // urls downloaded and the first attempt was successful (ie
400 // the previous job was complete), then we have the case where
401 // the user is forcing us to remirror. Reset all the values etc
402 // if this is the case then reset the variables.
403 // Note that this can cause the result line to look something
404 // like this.
405 // Downloaded 12 of 12 files (8 warnings, 0 errors).
406 // The warnings would be something like, 'File already downloaded'
407 // but the total number of files and the file successfully
408 // downloaded will be correct.
409 if(previous_state == DownloadJob.COMPLETE) {
410 progress.mirrorBegun(true, false);
411 }
412 else {
413 progress.mirrorBegun(false, false);
414 }
415
416 // Parse arguments into array.
417 args.add(Utility.BASE_DIR + "wget");
418 args.add("-d");
419 args.add("-o");
420 args.add("debug.txt");
421
422 if(destination != null) {
423 args.add("-P");
424 args.add(destination);
425 }
426
427 if(depth < 0) {
428 // Infinite recursion
429 args.add("-r");
430 }
431 else if (depth == 0) {
432 // Just this page.
433 }
434 else if (depth > 0) {
435 // Recursion to the specified depth.
436 args.add("-r");
437 args.add("-l");
438 args.add("" + depth + ""); // Hacky
439 }
440
441 if(previous_state == PAUSED) {
442 args.add("-nc");
443 args.add("-c");
444 }
445
446 if(proxy_user != null) {
447 args.add("--proxy-user=" + proxy_user);
448 args.add("--proxy-passwd=" + proxy_pass);
449 }
450
451 if(page_requisites) {
452 args.add("-p");
453 }
454
455 if(quiet) {
456 args.add("-q");
457 }
458
459 if(other_hosts) {
460 args.add("-H");
461 }
462
463 args.add(initial.toString());
464
465 DebugStream.println("Calling wget ");
466 for(Enumeration e = args.elements(); e.hasMoreElements();) {
467 DebugStream.println(e.nextElement() + " ");
468 }
469 DebugStream.println("");
470
471 // Run home to mummy.
472 int value = mummy.wget(args.size(), args.toArray(), debug);
473
474 // If we've got to here and the state isn't STOPPED then the job is complete.
475 if(state == RUNNING) {
476 progress.mirrorComplete();
477 previous_state = state;
478 state = COMPLETE;
479 }
480 }
481
482 /** Called by the WGet native code when the current download is
483 * completed. In turn all download listeners are informed.
484 */
485 public void downloadComplete() {
486 progress.downloadComplete();
487 url = null;
488 current_url = null;
489 }
490
491 public void downloadComplete(String current_file_downloading) {
492 progress.downloadComplete();
493 DebugStream.println("Current File: " + current_file_downloading);
494 // !! TEMPORARILY DISABLED !!
495 //WorkspaceTreeModel.refreshWebCacheMappings();
496// if(Gatherer.g_man.gather_pane.workspace_tree != null) {
497// FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.gather_pane.workspace_tree.getModel();
498// File new_file = new File(current_file_downloading);
499// File parent_file = new_file.getParentFile();
500// String download_cache = Utility.getCacheDir().getAbsolutePath();
501// ArrayList raw_path = new ArrayList();
502// while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
503// raw_path.add(0, parent_file.getName());
504// parent_file = parent_file.getParentFile();
505// }
506// download_cache = null;
507// // Add download cache name
508// /** @todo - add to dictionary */
509// raw_path.add(0, "Mirroring.Mirror_Cache");
510// // And the root node
511// raw_path.add(0, tree_model.getRoot());
512// TreePath destination_path = new TreePath(raw_path.toArray());
513// raw_path = null;
514// // Retrieve the destination node
515// FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
516// // destination_path = null;
517// //FileNode new_file_node = new FileNode(new_file);
518
519// // It suddenly occurs to me that by retrieving the destination path, we are causing the potential destination node to map its children which includes the file which I am about to add. Hence I was ending up with two copies.
520// ///atherer.println("Ready to insert new FileNode.");
521// DebugStream.println("Model: " + tree_model);
522// DebugStream.println("Destination path: " + destination_path);
523// destination_node.unmap();
524// ///atherer.println("Destination node: " + destination_node);
525// ///atherer.println("New node: " + new_file_node);
526// //SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);
527
528// //new_file_node = null;
529// destination_node = null;
530// tree_model = null;
531// }
532// url = null;
533// current_url = null;
534 }
535
536 /** Called by the WGet native code when the requested download returns
537 * a status code other than 200.
538 */
539 public void downloadFailed() {
540 ///ystem.out.println("downloadFailed("+current_url+")");
541 failed_urls.add(current_url); // Its the current url thats failed.
542 progress.downloadFailed();
543 }
544
    /** Called when the current download produced a warning (callWGet raises
     * this for the "already there, not overwriting" case); forwarded straight
     * to the progress bar.
     */
    public void downloadWarning() {
        progress.downloadWarning();
    }
550
551 /**
552 * @return A String representing the currently downloading url.
553 */
554 /* private String getCurrent() {
555 return current_url;
556 } */
557
558 /**
559 * @return A String representing the initial urls host (root node
560 * of tree that we are mirroring).
561 */
562 public String getHost() {
563 return url.getHost();
564 }
565
    /** @return The document that accumulates this job's wget log output. */
    public AppendLineOnlyFileDocument getLogDocument() {
        return download_log;
    }
569
    /**
     * @return The DownloadProgressBar associated with this job.
     */
    public DownloadProgressBar getProgressBar() {
        return progress;
    }
576
    /** Called to discover if the user wanted this thread to run or if
     * it is paused.
     * @return An int representing the current DownloadJob state (one of
     * COMPLETE, PAUSED, RUNNING or STOPPED).
     */
    public int getState() {
        return state;
    }
584
585 /** Returns the current state of the stop flag for this job.
586 * @return A boolean representing whether the user has requested to
587 * stop.
588 */
589 public boolean hasSignalledStop() {
590 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
591 state == DownloadJob.COMPLETE) {
592 return true;
593 }
594 return false;
595 }
596
597 public void setState(int state) {
598 previous_state = this.state;
599 this.state = state;
600 }
601
    /** A convenience call.
     * @return A String representing the url of the initial url (root node of the mirrored tree).
     */
    public String toString() {
        return initial.toString();
    }
608
    /** Called by the WGet native code to signal the current progress of
     * downloading; forwarded straight to the progress bar.
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
        progress.updateProgress(current, expected);
    }
619}
Note: See TracBrowser for help on using the repository browser.