Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/collection/DownloadJob.java@ 8231

Last change on this file since 8231 was 8231, checked in by mdewsnip, 20 years ago
Replaced all "Gatherer.config" with "Configuration".
Property svn:keywords set to `Author Date Id Revision`
File size: 19.7 KB

Line
1	/**
2	*#########################################################################
3	*
4	* A component of the Gatherer application, part of the Greenstone digital
5	* library suite from the New Zealand Digital Library Project at the
6	* University of Waikato, New Zealand.
7	*
8	* <BR><BR>
9	*
10	* Author: John Thompson, Greenstone Digital Library, University of Waikato
11	*
12	* <BR><BR>
13	*
14	* Copyright (C) 1999 New Zealand Digital Library Project
15	*
16	* <BR><BR>
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* <BR><BR>
24	*
25	* This program is distributed in the hope that it will be useful,
26	* but WITHOUT ANY WARRANTY; without even the implied warranty of
27	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28	* GNU General Public License for more details.
29	*
30	* <BR><BR>
31	*
32	* You should have received a copy of the GNU General Public License
33	* along with this program; if not, write to the Free Software
34	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35	*########################################################################
36	*/
37	package org.greenstone.gatherer.collection;
38
39	import java.awt.event.*;
40	import java.io.*;
41	import java.net.*;
42	import java.util.*;
43	import javax.swing.tree.*;
44	import org.greenstone.gatherer.Configuration;
45	import org.greenstone.gatherer.Dictionary;
46	import org.greenstone.gatherer.Gatherer;
47	import org.greenstone.gatherer.WGet;
48	import org.greenstone.gatherer.file.FileNode;
49	import org.greenstone.gatherer.file.FileSystemModel;
50	import org.greenstone.gatherer.file.WorkspaceTreeModel;
51	import org.greenstone.gatherer.gui.DownloadProgressBar;
52	import org.greenstone.gatherer.gui.tree.WorkspaceTree;
53	import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
54	import org.greenstone.gatherer.util.GURL;
55	import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
56	import org.greenstone.gatherer.util.Utility;
57	/**
58	* @author John Thompson, Greenstone Digital Library, University of Waikato
59	* @version 2.0
60	*/
61	public class DownloadJob
62	implements ActionListener {
63
64	private boolean debug;
65	private boolean higher_directories;
66	private boolean no_parents;
67	private boolean other_hosts;
68	private boolean page_requisites;
69	private boolean quiet;
70
71	private AppendLineOnlyFileDocument download_log;
72
73	private DownloadProgressBar progress;
74
75	private GURL initial = null;
76	private GURL url = null;
77
78	// private TreeModel model;
79
80	private int depth;
81	private int previous_state;
82	private int state;
83
84	private String current_url;
85	private String destination;
86	private String proxy_pass;
87	private String proxy_user;
88
89	private Vector encountered_urls;
90	private Vector failed_urls;
91
92	private WGet mummy;
93
94	public static int COMPLETE = 0;
95	public static int PAUSED = 1;
96	public static int RUNNING = 2;
97	public static int STOPPED = 3;
98
99	/**
100	*/
101	public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
102	// this.model = model;
103
104	String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
105	File log_file = new File(log_filename);
106	if(log_file.exists()) {
107	log_file.delete();
108	}
109	File parent_log_file = log_file.getParentFile();
110	parent_log_file.mkdirs();
111	parent_log_file = null;
112	log_file = null;
113
114	this.debug = debug;
115	this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
116	this.no_parents = no_parents;
117	this.other_hosts = other_hosts;
118	this.page_requisites = page_requisites;
119	this.quiet = quiet;
120	this.initial = new GURL(initial);
121	this.depth = depth;
122	this.destination = destination;
123	this.proxy_pass = proxy_pass;
124	this.proxy_user = proxy_user;
125	this.mummy = mummy;
126
127	progress = new DownloadProgressBar(this, initial.toString(), simple);
128
129	encountered_urls = new Vector();
130	failed_urls = new Vector();
131
132	previous_state = STOPPED;
133	state = STOPPED;
134	}
135
136	/** Depending on which button on the progress bar was pushed,
137	* this method will affect the state of the DownloadJob and perhaps make
138	* calls to wget.class if necessary.
139	* @param event The ActionEvent fired from within the DownloadProgressBar
140	* which we must respond to.
141	*/
142	public void actionPerformed(ActionEvent event) {
143	// The stop_start_button is used to alternately start or stop the
144	// job. If the current state of the job is paused then this
145	// restart is logically equivelent to a resume.
146	if(event.getSource() == progress.stop_start_button) {
147	previous_state = state;
148	if (state == RUNNING) {
149	state = STOPPED;
150	} else {
151	//previous_state = state;
152	state = RUNNING;
153	mummy.resumeThread();
154	}
155	}
156	else if (event.getSource() == progress.close_button) {
157	if(state == RUNNING) {
158	previous_state = state;
159	state = STOPPED; // do we need to do anything else to stop this?
160	}
161	// else {
162	mummy.deleteDownloadJob(this);
163	// }
164	}
165	}
166
167	/** Called by the WGet native code to inform us of a new download starting.
168	* @param raw_url The url that is being downloaded, as a String.
169	*/
170	public void addDownload(String raw_url) {
171	if(!encountered_urls.contains(raw_url)) {
172	encountered_urls.add(raw_url);
173	}
174	// Regardless create a new GURL
175	current_url = raw_url;
176	url = new GURL(raw_url);
177	progress.addDownload(raw_url);
178	}
179
180	/** Used to advise the DownloadJob of a newly parsed link. Its up to DownloadJob
181	* to decide if it already knows about this url, and if not to
182	* update its progress bar.
183	* @param raw_url The url in question as a String.
184	* @param type Whether the link is an internal or external link.
185	* @return A boolean indicating if the url was added.
186	*/
187	public boolean addLink(String raw_url, int type) {
188	///ystem.out.println("addLink("+url+", "+type+")");
189	if(!encountered_urls.contains(raw_url)) {
190	// Add it to the urls we've seen.
191	encountered_urls.add(raw_url);
192	// Add it the to links for the current GURL.
193
194	// Add it to the progress file count.
195	progress.increaseFileCount();
196	return true;
197	}
198	// Regardless add it to the children links of the current GURL
199	initial.addLink(raw_url);
200
201	// We've seen it before. Don't count it again.
202	return false;
203	}
204
205	public void callWGet() {
206	// Build parameter string. Note that we never clobber, and we continue if possible
207
208	// want to always download newer files, convert non-relative links to relative, always use directories, and only try twice to get a file before giving up
209	String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";
210
211	if (no_parents) {
212	command = command + "-np ";
213	}
214	if(depth < 0) {
215	// Infinite recursion
216	command = command + "-r ";
217	}
218	else if (depth == 0) {
219	// Just this page.
220	}
221	else if (depth > 0) {
222	// Recursion to the specified depth.
223	command = command + "-r -l" + depth + " ";
224	}
225
226	String proxy_url = "";
227	// Determine if we have to use a proxy.
228	if(Configuration.get("general.use_proxy", true)) {
229	String proxy_host = Configuration.getString("general.proxy_host", true);
230	String proxy_port = Configuration.getString("general.proxy_port", true);
231	// Find out whether the user has already authenticated themselves
232	String user_pass = null;
233	String address = proxy_host + ":" + proxy_port;
234	int count = 0;
235	while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
236	Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
237	count++;
238	}
239	if(count >= 3) {
240	state = STOPPED;
241	return;
242	}
243	if(user_pass.indexOf("@") != -1) {
244
245	// Write the use proxy command - we don't do this anymore, instead we set environment variables - hopefully these can't be spied on like the follwoing can (using ps) - actually the environment stuff didn't work for windows, so lets go back to this
246	if (Utility.isWindows()) {
247	command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
248	} else {
249	String user_name = user_pass.substring(0, user_pass.indexOf("@"));
250	String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
251	proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
252	}
253
254	}
255	else {
256	Gatherer.println("Unknown user/pass");
257	}
258	}
259
260	// The user can choose to mirror all of the page requisites...
261	if(page_requisites) {
262	command = command + "-p ";
263	}
264
265	// Download files from other hosts
266	if(other_hosts) {
267	command = command + "-H ";
268	}
269
270	// Finally tell it the site to download.
271	command = command + initial.toString();
272
273	if(previous_state == DownloadJob.COMPLETE) {
274	progress.mirrorBegun(true, true);
275	}
276	else {
277	progress.mirrorBegun(false, true);
278	}
279
280	File dest_file = new File(destination);
281	if (!dest_file.exists()) {
282	dest_file.mkdirs();
283	}
284	// Run it
285	try {
286	//Gatherer.println("Cmd: " + command); // don't print it out cos it may have the password in it
287	Runtime rt = Runtime.getRuntime();
288	String [] env = null;
289	if (!proxy_url.equals("")) {
290	env = new String[2];
291	env[0] = "http_proxy=http://"+proxy_url;
292	env[1] = "ftp_proxy=ftp://"+proxy_url;
293	}
294	Process prcs = rt.exec(command, env, dest_file);
295	InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
296	BufferedReader br = new BufferedReader(isr);
297	// Capture the standard error stream and seach for two particular occurances.
298	String line;
299	boolean ignore_for_robots = false;
300	while ((line = br.readLine()) != null && state != STOPPED) {
301
302	Gatherer.println(line);
303	download_log.appendLine(line);
304	// The first magic special test is to see if we've just
305	// asked for the robots.txt file. If so we ignore
306	// the next add and then the next complete/error.
307	if(line.lastIndexOf("robots.txt;") != -1) {
308	Gatherer.println("***** Requesting robot.txt");
309	ignore_for_robots = true;
310	}
311	// If line contains "=> `" display text as the
312	// currently downloading url. Unique to add download.
313	else if(line.lastIndexOf("=> `") != -1) {
314	if(!ignore_for_robots) {
315	// Add download
316	String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
317	addDownload("http:/" + new_url);
318	}
319	}
320	// If line contains "/s) - `" set currently
321	// downloading url to "Download Complete".
322	else if(line.lastIndexOf("/s) - `") != -1) {
323	String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
324	if(!ignore_for_robots) {
325	Gatherer.println("Not ignore for robots");
326	// Download complete
327	downloadComplete(current_file_downloading);
328	}
329	else {
330	Gatherer.println("Ignore for robots");
331	ignore_for_robots = false;
332	}
333	}
334	// The already there line begins "File `..." However this
335	// is only true in english, so instead I looked and there
336	// are few (if any at all) other messages than those above
337	// and not overwriting messages that use " `" so we'll
338	// look for that. Note this method is not guarenteed to be
339	// unique like the previous two.
340	else if(line.lastIndexOf(" `") != -1) {
341	// Not Overwriting
342	Gatherer.println("Already there.");
343	String new_url =
344	line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
345	addDownload("http:/" + new_url);
346	downloadWarning();
347	}
348	// Any other important message starts with the time in the form hh:mm:ss
349	else if(line.length() > 7) {
350	if(line.charAt(2) == ':' && line.charAt(5) == ':') {
351	if(!ignore_for_robots) {
352	Gatherer.println("Error.");
353	downloadFailed();
354	}
355	else {
356	ignore_for_robots = false;
357	}
358	}
359	}
360	}
361	if(state == STOPPED) {
362	isr.close();
363	prcs.destroy(); // This doesn't always work, but it's worth a try
364	}
365	else {
366	// Now display final message based on exit value
367	prcs.waitFor();
368	}
369	}
370	catch (Exception ioe) {
371	//message(Utility.ERROR, ioe.toString());
372	Gatherer.printStackTrace(ioe);
373	}
374	// If we've got to here and the state isn't STOPPED then the
375	// job is complete.
376	if(state == DownloadJob.RUNNING) {
377	progress.mirrorComplete();
378	previous_state = state;
379	state = DownloadJob.COMPLETE;
380
381	}
382	// refresh the workspace tree
383	Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
384	}
385
386
387	/** The most important part of the DownloadJob class, this method is
388	* responsible for calling the WGet native methods used to
389	* mirror the indicated url. By this stage all the variables
390	* necessary should be set and we need only build up the
391	* parameter string and make the call.
392	*/
393	public void callWGetNative() {
394	Vector args = new Vector();
395
396	// Let the DownloadProgressBar know we're starting, just in case
397	// the user hasn't told us to. If this is the second time the
398	// urls downloaded and the first attempt was successful (ie
399	// the previous job was complete), then we have the case where
400	// the user is forcing us to remirror. Reset all the values etc
401	// if this is the case then reset the variables.
402	// Note that this can cause the result line to look something
403	// like this.
404	// Downloaded 12 of 12 files (8 warnings, 0 errors).
405	// The warnings would be something like, 'File already downloaded'
406	// but the total number of files and the file successfully
407	// downloaded will be correct.
408	if(previous_state == DownloadJob.COMPLETE) {
409	progress.mirrorBegun(true, false);
410	}
411	else {
412	progress.mirrorBegun(false, false);
413	}
414
415	// Parse arguments into array.
416	args.add(Utility.BASE_DIR + "wget");
417	args.add("-d");
418	args.add("-o");
419	args.add("debug.txt");
420
421	if(destination != null) {
422	args.add("-P");
423	args.add(destination);
424	}
425
426	if(depth < 0) {
427	// Infinite recursion
428	args.add("-r");
429	}
430	else if (depth == 0) {
431	// Just this page.
432	}
433	else if (depth > 0) {
434	// Recursion to the specified depth.
435	args.add("-r");
436	args.add("-l");
437	args.add("" + depth + ""); // Hacky
438	}
439
440	if(previous_state == PAUSED) {
441	args.add("-nc");
442	args.add("-c");
443	}
444
445	if(proxy_user != null) {
446	args.add("--proxy-user=" + proxy_user);
447	args.add("--proxy-passwd=" + proxy_pass);
448	}
449
450	if(page_requisites) {
451	args.add("-p");
452	}
453
454	if(quiet) {
455	args.add("-q");
456	}
457
458	if(other_hosts) {
459	args.add("-H");
460	}
461
462	args.add(initial.toString());
463
464	Gatherer.println("Calling wget ");
465	for(Enumeration e = args.elements(); e.hasMoreElements();) {
466	Gatherer.println(e.nextElement() + " ");
467	}
468	Gatherer.println("");
469
470	// Run home to mummy.
471	int value = mummy.wget(args.size(), args.toArray(), debug);
472
473	// If we've got to here and the state isn't STOPPED then the job is complete.
474	if(state == RUNNING) {
475	progress.mirrorComplete();
476	previous_state = state;
477	state = COMPLETE;
478	}
479	}
480
481	/** Called by the WGet native code when the current download is
482	* completed. In turn all download listeners are informed.
483	*/
484	public void downloadComplete() {
485	progress.downloadComplete();
486	url = null;
487	current_url = null;
488	}
489
490	public void downloadComplete(String current_file_downloading) {
491	progress.downloadComplete();
492	Gatherer.println("Current File: " + current_file_downloading);
493	// !! TEMPORARILY DISABLED !!
494	//WorkspaceTreeModel.refreshWebCacheMappings();
495	// if(Gatherer.g_man.gather_pane.workspace_tree != null) {
496	// FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.gather_pane.workspace_tree.getModel();
497	// File new_file = new File(current_file_downloading);
498	// File parent_file = new_file.getParentFile();
499	// String download_cache = Utility.getCacheDir().getAbsolutePath();
500	// ArrayList raw_path = new ArrayList();
501	// while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
502	// raw_path.add(0, parent_file.getName());
503	// parent_file = parent_file.getParentFile();
504	// }
505	// download_cache = null;
506	// // Add download cache name
507	// /** @todo - add to dictionary */
508	// raw_path.add(0, "Mirroring.Mirror_Cache");
509	// // And the root node
510	// raw_path.add(0, tree_model.getRoot());
511	// TreePath destination_path = new TreePath(raw_path.toArray());
512	// raw_path = null;
513	// // Retrieve the destination node
514	// FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
515	// // destination_path = null;
516	// //FileNode new_file_node = new FileNode(new_file);
517
518	// // It suddenly occurs to me that by retrieving the destination path, we are causing the potential destination node to map its children which includes the file which I am about to add. Hence I was ending up with two copies.
519	// ///atherer.println("Ready to insert new FileNode.");
520	// Gatherer.println("Model: " + tree_model);
521	// Gatherer.println("Destination path: " + destination_path);
522	// destination_node.unmap();
523	// ///atherer.println("Destination node: " + destination_node);
524	// ///atherer.println("New node: " + new_file_node);
525	// //SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);
526
527	// //new_file_node = null;
528	// destination_node = null;
529	// tree_model = null;
530	// }
531	// url = null;
532	// current_url = null;
533	}
534
535	/** Called by the WGet native code when the requested download returns
536	* a status code other than 200.
537	*/
538	public void downloadFailed() {
539	///ystem.out.println("downloadFailed("+current_url+")");
540	failed_urls.add(current_url); // Its the current url thats failed.
541	progress.downloadFailed();
542	}
543
544	/**
545	*/
546	public void downloadWarning() {
547	progress.downloadWarning();
548	}
549
550	/**
551	* @return A String representing the currently downloading url.
552	*/
553	/* private String getCurrent() {
554	return current_url;
555	} */
556
557	/**
558	* @return A String representing the initial urls host (root node
559	* of tree that we are mirroring).
560	*/
561	public String getHost() {
562	return url.getHost();
563	}
564
565	public AppendLineOnlyFileDocument getLogDocument() {
566	return download_log;
567	}
568
569	/**
570	* @return Returns the progress bar associated with this job.
571	*/
572	public DownloadProgressBar getProgressBar() {
573	return progress;
574	}
575
576	/** Called to discover if the user wanted this thread to run or if
577	* it is paused.
578	* @return An int representing the current DownloadJob state.
579	*/
580	public int getState() {
581	return state;
582	}
583
584	/** Returns the current state of the stop flag for this job.
585	* @return A boolean representing whether the user has requested to
586	* stop.
587	*/
588	public boolean hasSignalledStop() {
589	if(state == DownloadJob.STOPPED \|\| state == DownloadJob.PAUSED \|\|
590	state == DownloadJob.COMPLETE) {
591	return true;
592	}
593	return false;
594	}
595
596	public void setState(int state) {
597	previous_state = this.state;
598	this.state = state;
599	}
600
601	/** A convinence call.
602	* @return A String representing the url of the initial url (root node of the mirrored tree).
603	*/
604	public String toString() {
605	return initial.toString();
606	}
607
608	/** Called by the WGet native code to signal the current progress of
609	* downloading.
610	* @param current A long representing the number of bytes that have
611	* been downloaded since the last update.
612	* @param expected A long representing the total number of bytes
613	* expected for this download.
614	*/
615	public void updateProgress(long current, long expected) {
616	progress.updateProgress(current, expected);
617	}
618	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: