source: trunk/gli/src/org/greenstone/gatherer/collection/DownloadJob.java@ 8231

Last change on this file since 8231 was 8231, checked in by mdewsnip, 20 years ago

Replaced all "Gatherer.config" with "Configuration".

  • Property svn:keywords set to Author Date Id Revision
File size: 19.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.Dictionary;
46import org.greenstone.gatherer.Gatherer;
47import org.greenstone.gatherer.WGet;
48import org.greenstone.gatherer.file.FileNode;
49import org.greenstone.gatherer.file.FileSystemModel;
50import org.greenstone.gatherer.file.WorkspaceTreeModel;
51import org.greenstone.gatherer.gui.DownloadProgressBar;
52import org.greenstone.gatherer.gui.tree.WorkspaceTree;
53import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
54import org.greenstone.gatherer.util.GURL;
55import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
56import org.greenstone.gatherer.util.Utility;
57/**
58 * @author John Thompson, Greenstone Digital Library, University of Waikato
59 * @version 2.0
60 */
61public class DownloadJob
62 implements ActionListener {
63
64 private boolean debug;
65 private boolean higher_directories;
66 private boolean no_parents;
67 private boolean other_hosts;
68 private boolean page_requisites;
69 private boolean quiet;
70
71 private AppendLineOnlyFileDocument download_log;
72
73 private DownloadProgressBar progress;
74
75 private GURL initial = null;
76 private GURL url = null;
77
78 // private TreeModel model;
79
80 private int depth;
81 private int previous_state;
82 private int state;
83
84 private String current_url;
85 private String destination;
86 private String proxy_pass;
87 private String proxy_user;
88
89 private Vector encountered_urls;
90 private Vector failed_urls;
91
92 private WGet mummy;
93
94 public static int COMPLETE = 0;
95 public static int PAUSED = 1;
96 public static int RUNNING = 2;
97 public static int STOPPED = 3;
98
99 /**
100 */
101 public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
102 // this.model = model;
103
104 String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
105 File log_file = new File(log_filename);
106 if(log_file.exists()) {
107 log_file.delete();
108 }
109 File parent_log_file = log_file.getParentFile();
110 parent_log_file.mkdirs();
111 parent_log_file = null;
112 log_file = null;
113
114 this.debug = debug;
115 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
116 this.no_parents = no_parents;
117 this.other_hosts = other_hosts;
118 this.page_requisites = page_requisites;
119 this.quiet = quiet;
120 this.initial = new GURL(initial);
121 this.depth = depth;
122 this.destination = destination;
123 this.proxy_pass = proxy_pass;
124 this.proxy_user = proxy_user;
125 this.mummy = mummy;
126
127 progress = new DownloadProgressBar(this, initial.toString(), simple);
128
129 encountered_urls = new Vector();
130 failed_urls = new Vector();
131
132 previous_state = STOPPED;
133 state = STOPPED;
134 }
135
136 /** Depending on which button on the progress bar was pushed,
137 * this method will affect the state of the DownloadJob and perhaps make
138 * calls to wget.class if necessary.
139 * @param event The ActionEvent fired from within the DownloadProgressBar
140 * which we must respond to.
141 */
142 public void actionPerformed(ActionEvent event) {
143 // The stop_start_button is used to alternately start or stop the
144 // job. If the current state of the job is paused then this
145 // restart is logically equivelent to a resume.
146 if(event.getSource() == progress.stop_start_button) {
147 previous_state = state;
148 if (state == RUNNING) {
149 state = STOPPED;
150 } else {
151 //previous_state = state;
152 state = RUNNING;
153 mummy.resumeThread();
154 }
155 }
156 else if (event.getSource() == progress.close_button) {
157 if(state == RUNNING) {
158 previous_state = state;
159 state = STOPPED; // do we need to do anything else to stop this?
160 }
161 // else {
162 mummy.deleteDownloadJob(this);
163 // }
164 }
165 }
166
167 /** Called by the WGet native code to inform us of a new download starting.
168 * @param raw_url The url that is being downloaded, as a String.
169 */
170 public void addDownload(String raw_url) {
171 if(!encountered_urls.contains(raw_url)) {
172 encountered_urls.add(raw_url);
173 }
174 // Regardless create a new GURL
175 current_url = raw_url;
176 url = new GURL(raw_url);
177 progress.addDownload(raw_url);
178 }
179
180 /** Used to advise the DownloadJob of a newly parsed link. Its up to DownloadJob
181 * to decide if it already knows about this url, and if not to
182 * update its progress bar.
183 * @param raw_url The url in question as a String.
184 * @param type Whether the link is an internal or external link.
185 * @return A boolean indicating if the url was added.
186 */
187 public boolean addLink(String raw_url, int type) {
188 ///ystem.out.println("addLink("+url+", "+type+")");
189 if(!encountered_urls.contains(raw_url)) {
190 // Add it to the urls we've seen.
191 encountered_urls.add(raw_url);
192 // Add it the to links for the current GURL.
193
194 // Add it to the progress file count.
195 progress.increaseFileCount();
196 return true;
197 }
198 // Regardless add it to the children links of the current GURL
199 initial.addLink(raw_url);
200
201 // We've seen it before. Don't count it again.
202 return false;
203 }
204
205 public void callWGet() {
206 // Build parameter string. Note that we never clobber, and we continue if possible
207
208 // want to always download newer files, convert non-relative links to relative, always use directories, and only try twice to get a file before giving up
209 String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";
210
211 if (no_parents) {
212 command = command + "-np ";
213 }
214 if(depth < 0) {
215 // Infinite recursion
216 command = command + "-r ";
217 }
218 else if (depth == 0) {
219 // Just this page.
220 }
221 else if (depth > 0) {
222 // Recursion to the specified depth.
223 command = command + "-r -l" + depth + " ";
224 }
225
226 String proxy_url = "";
227 // Determine if we have to use a proxy.
228 if(Configuration.get("general.use_proxy", true)) {
229 String proxy_host = Configuration.getString("general.proxy_host", true);
230 String proxy_port = Configuration.getString("general.proxy_port", true);
231 // Find out whether the user has already authenticated themselves
232 String user_pass = null;
233 String address = proxy_host + ":" + proxy_port;
234 int count = 0;
235 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
236 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
237 count++;
238 }
239 if(count >= 3) {
240 state = STOPPED;
241 return;
242 }
243 if(user_pass.indexOf("@") != -1) {
244
245 // Write the use proxy command - we don't do this anymore, instead we set environment variables - hopefully these can't be spied on like the follwoing can (using ps) - actually the environment stuff didn't work for windows, so lets go back to this
246 if (Utility.isWindows()) {
247 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
248 } else {
249 String user_name = user_pass.substring(0, user_pass.indexOf("@"));
250 String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
251 proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
252 }
253
254 }
255 else {
256 Gatherer.println("Unknown user/pass");
257 }
258 }
259
260 // The user can choose to mirror all of the page requisites...
261 if(page_requisites) {
262 command = command + "-p ";
263 }
264
265 // Download files from other hosts
266 if(other_hosts) {
267 command = command + "-H ";
268 }
269
270 // Finally tell it the site to download.
271 command = command + initial.toString();
272
273 if(previous_state == DownloadJob.COMPLETE) {
274 progress.mirrorBegun(true, true);
275 }
276 else {
277 progress.mirrorBegun(false, true);
278 }
279
280 File dest_file = new File(destination);
281 if (!dest_file.exists()) {
282 dest_file.mkdirs();
283 }
284 // Run it
285 try {
286 //Gatherer.println("Cmd: " + command); // don't print it out cos it may have the password in it
287 Runtime rt = Runtime.getRuntime();
288 String [] env = null;
289 if (!proxy_url.equals("")) {
290 env = new String[2];
291 env[0] = "http_proxy=http://"+proxy_url;
292 env[1] = "ftp_proxy=ftp://"+proxy_url;
293 }
294 Process prcs = rt.exec(command, env, dest_file);
295 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
296 BufferedReader br = new BufferedReader(isr);
297 // Capture the standard error stream and seach for two particular occurances.
298 String line;
299 boolean ignore_for_robots = false;
300 while ((line = br.readLine()) != null && state != STOPPED) {
301
302 Gatherer.println(line);
303 download_log.appendLine(line);
304 // The first magic special test is to see if we've just
305 // asked for the robots.txt file. If so we ignore
306 // the next add and then the next complete/error.
307 if(line.lastIndexOf("robots.txt;") != -1) {
308 Gatherer.println("***** Requesting robot.txt");
309 ignore_for_robots = true;
310 }
311 // If line contains "=> `" display text as the
312 // currently downloading url. Unique to add download.
313 else if(line.lastIndexOf("=> `") != -1) {
314 if(!ignore_for_robots) {
315 // Add download
316 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
317 addDownload("http:/" + new_url);
318 }
319 }
320 // If line contains "/s) - `" set currently
321 // downloading url to "Download Complete".
322 else if(line.lastIndexOf("/s) - `") != -1) {
323 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
324 if(!ignore_for_robots) {
325 Gatherer.println("Not ignore for robots");
326 // Download complete
327 downloadComplete(current_file_downloading);
328 }
329 else {
330 Gatherer.println("Ignore for robots");
331 ignore_for_robots = false;
332 }
333 }
334 // The already there line begins "File `..." However this
335 // is only true in english, so instead I looked and there
336 // are few (if any at all) other messages than those above
337 // and not overwriting messages that use " `" so we'll
338 // look for that. Note this method is not guarenteed to be
339 // unique like the previous two.
340 else if(line.lastIndexOf(" `") != -1) {
341 // Not Overwriting
342 Gatherer.println("Already there.");
343 String new_url =
344 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
345 addDownload("http:/" + new_url);
346 downloadWarning();
347 }
348 // Any other important message starts with the time in the form hh:mm:ss
349 else if(line.length() > 7) {
350 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
351 if(!ignore_for_robots) {
352 Gatherer.println("Error.");
353 downloadFailed();
354 }
355 else {
356 ignore_for_robots = false;
357 }
358 }
359 }
360 }
361 if(state == STOPPED) {
362 isr.close();
363 prcs.destroy(); // This doesn't always work, but it's worth a try
364 }
365 else {
366 // Now display final message based on exit value
367 prcs.waitFor();
368 }
369 }
370 catch (Exception ioe) {
371 //message(Utility.ERROR, ioe.toString());
372 Gatherer.printStackTrace(ioe);
373 }
374 // If we've got to here and the state isn't STOPPED then the
375 // job is complete.
376 if(state == DownloadJob.RUNNING) {
377 progress.mirrorComplete();
378 previous_state = state;
379 state = DownloadJob.COMPLETE;
380
381 }
382 // refresh the workspace tree
383 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
384 }
385
386
387 /** The most important part of the DownloadJob class, this method is
388 * responsible for calling the WGet native methods used to
389 * mirror the indicated url. By this stage all the variables
390 * necessary should be set and we need only build up the
391 * parameter string and make the call.
392 */
393 public void callWGetNative() {
394 Vector args = new Vector();
395
396 // Let the DownloadProgressBar know we're starting, just in case
397 // the user hasn't told us to. If this is the second time the
398 // urls downloaded and the first attempt was successful (ie
399 // the previous job was complete), then we have the case where
400 // the user is forcing us to remirror. Reset all the values etc
401 // if this is the case then reset the variables.
402 // Note that this can cause the result line to look something
403 // like this.
404 // Downloaded 12 of 12 files (8 warnings, 0 errors).
405 // The warnings would be something like, 'File already downloaded'
406 // but the total number of files and the file successfully
407 // downloaded will be correct.
408 if(previous_state == DownloadJob.COMPLETE) {
409 progress.mirrorBegun(true, false);
410 }
411 else {
412 progress.mirrorBegun(false, false);
413 }
414
415 // Parse arguments into array.
416 args.add(Utility.BASE_DIR + "wget");
417 args.add("-d");
418 args.add("-o");
419 args.add("debug.txt");
420
421 if(destination != null) {
422 args.add("-P");
423 args.add(destination);
424 }
425
426 if(depth < 0) {
427 // Infinite recursion
428 args.add("-r");
429 }
430 else if (depth == 0) {
431 // Just this page.
432 }
433 else if (depth > 0) {
434 // Recursion to the specified depth.
435 args.add("-r");
436 args.add("-l");
437 args.add("" + depth + ""); // Hacky
438 }
439
440 if(previous_state == PAUSED) {
441 args.add("-nc");
442 args.add("-c");
443 }
444
445 if(proxy_user != null) {
446 args.add("--proxy-user=" + proxy_user);
447 args.add("--proxy-passwd=" + proxy_pass);
448 }
449
450 if(page_requisites) {
451 args.add("-p");
452 }
453
454 if(quiet) {
455 args.add("-q");
456 }
457
458 if(other_hosts) {
459 args.add("-H");
460 }
461
462 args.add(initial.toString());
463
464 Gatherer.println("Calling wget ");
465 for(Enumeration e = args.elements(); e.hasMoreElements();) {
466 Gatherer.println(e.nextElement() + " ");
467 }
468 Gatherer.println("");
469
470 // Run home to mummy.
471 int value = mummy.wget(args.size(), args.toArray(), debug);
472
473 // If we've got to here and the state isn't STOPPED then the job is complete.
474 if(state == RUNNING) {
475 progress.mirrorComplete();
476 previous_state = state;
477 state = COMPLETE;
478 }
479 }
480
481 /** Called by the WGet native code when the current download is
482 * completed. In turn all download listeners are informed.
483 */
484 public void downloadComplete() {
485 progress.downloadComplete();
486 url = null;
487 current_url = null;
488 }
489
490 public void downloadComplete(String current_file_downloading) {
491 progress.downloadComplete();
492 Gatherer.println("Current File: " + current_file_downloading);
493 // !! TEMPORARILY DISABLED !!
494 //WorkspaceTreeModel.refreshWebCacheMappings();
495// if(Gatherer.g_man.gather_pane.workspace_tree != null) {
496// FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.gather_pane.workspace_tree.getModel();
497// File new_file = new File(current_file_downloading);
498// File parent_file = new_file.getParentFile();
499// String download_cache = Utility.getCacheDir().getAbsolutePath();
500// ArrayList raw_path = new ArrayList();
501// while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
502// raw_path.add(0, parent_file.getName());
503// parent_file = parent_file.getParentFile();
504// }
505// download_cache = null;
506// // Add download cache name
507// /** @todo - add to dictionary */
508// raw_path.add(0, "Mirroring.Mirror_Cache");
509// // And the root node
510// raw_path.add(0, tree_model.getRoot());
511// TreePath destination_path = new TreePath(raw_path.toArray());
512// raw_path = null;
513// // Retrieve the destination node
514// FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
515// // destination_path = null;
516// //FileNode new_file_node = new FileNode(new_file);
517
518// // It suddenly occurs to me that by retrieving the destination path, we are causing the potential destination node to map its children which includes the file which I am about to add. Hence I was ending up with two copies.
519// ///atherer.println("Ready to insert new FileNode.");
520// Gatherer.println("Model: " + tree_model);
521// Gatherer.println("Destination path: " + destination_path);
522// destination_node.unmap();
523// ///atherer.println("Destination node: " + destination_node);
524// ///atherer.println("New node: " + new_file_node);
525// //SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);
526
527// //new_file_node = null;
528// destination_node = null;
529// tree_model = null;
530// }
531// url = null;
532// current_url = null;
533 }
534
535 /** Called by the WGet native code when the requested download returns
536 * a status code other than 200.
537 */
538 public void downloadFailed() {
539 ///ystem.out.println("downloadFailed("+current_url+")");
540 failed_urls.add(current_url); // Its the current url thats failed.
541 progress.downloadFailed();
542 }
543
544 /**
545 */
546 public void downloadWarning() {
547 progress.downloadWarning();
548 }
549
550 /**
551 * @return A String representing the currently downloading url.
552 */
553 /* private String getCurrent() {
554 return current_url;
555 } */
556
557 /**
558 * @return A String representing the initial urls host (root node
559 * of tree that we are mirroring).
560 */
561 public String getHost() {
562 return url.getHost();
563 }
564
565 public AppendLineOnlyFileDocument getLogDocument() {
566 return download_log;
567 }
568
569 /**
570 * @return Returns the progress bar associated with this job.
571 */
572 public DownloadProgressBar getProgressBar() {
573 return progress;
574 }
575
576 /** Called to discover if the user wanted this thread to run or if
577 * it is paused.
578 * @return An int representing the current DownloadJob state.
579 */
580 public int getState() {
581 return state;
582 }
583
584 /** Returns the current state of the stop flag for this job.
585 * @return A boolean representing whether the user has requested to
586 * stop.
587 */
588 public boolean hasSignalledStop() {
589 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
590 state == DownloadJob.COMPLETE) {
591 return true;
592 }
593 return false;
594 }
595
596 public void setState(int state) {
597 previous_state = this.state;
598 this.state = state;
599 }
600
601 /** A convinence call.
602 * @return A String representing the url of the initial url (root node of the mirrored tree).
603 */
604 public String toString() {
605 return initial.toString();
606 }
607
608 /** Called by the WGet native code to signal the current progress of
609 * downloading.
610 * @param current A long representing the number of bytes that have
611 * been downloaded since the last update.
612 * @param expected A long representing the total number of bytes
613 * expected for this download.
614 */
615 public void updateProgress(long current, long expected) {
616 progress.updateProgress(current, expected);
617 }
618}
Note: See TracBrowser for help on using the repository browser.