source: trunk/gli/src/org/greenstone/gatherer/collection/DownloadJob.java@ 10009

Last change on this file since 10009 was 8811, checked in by mdewsnip, 19 years ago

Minor tidy ups.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.5 KB
/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * <BR><BR>
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * <BR><BR>
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * <BR><BR>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * <BR><BR>
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * <BR><BR>
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.collection;

import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.tree.*;
import org.greenstone.gatherer.Configuration;
import org.greenstone.gatherer.DebugStream;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.GAuthenticator;
import org.greenstone.gatherer.WGet;
import org.greenstone.gatherer.file.WorkspaceTree;
import org.greenstone.gatherer.gui.DownloadProgressBar;
import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
import org.greenstone.gatherer.util.GURL;
import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
import org.greenstone.gatherer.util.Utility;
/**
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.0
 */
public class DownloadJob
    implements ActionListener {

    private boolean debug;
    private boolean higher_directories;
    private boolean no_parents;
    private boolean other_hosts;
    private boolean page_requisites;
    private boolean quiet;

    private AppendLineOnlyFileDocument download_log;

    private DownloadProgressBar progress;

    private GURL initial = null;
    private GURL url = null;

    // private TreeModel model;

    private int depth;
    private int previous_state;
    private int state;

    private String current_url;
    private String destination;
    private String proxy_pass;
    private String proxy_user;

    private Vector encountered_urls;
    private Vector failed_urls;

    private WGet mummy;

    public static int COMPLETE = 0;
    public static int PAUSED = 1;
    public static int RUNNING = 2;
    public static int STOPPED = 3;

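    // Descriptive note (derived from the code below): a job starts out STOPPED, is
    // moved to RUNNING by the progress bar's stop/start button or by setState(), and
    // is left COMPLETE by callWGet()/callWGetNative() once mirroring finishes.
    // previous_state records the state held before the most recent transition, so a
    // re-run of a COMPLETE job is treated as a forced remirror.
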
    /** Creates a new DownloadJob, removing any existing wget log for this
     * initial url and setting up the log document, progress bar and url
     * bookkeeping before the job is started.
     */
    public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
        // this.model = model;

        String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
        File log_file = new File(log_filename);
        if(log_file.exists()) {
            log_file.delete();
        }
        File parent_log_file = log_file.getParentFile();
        parent_log_file.mkdirs();
        parent_log_file = null;
        log_file = null;

        this.debug = debug;
        this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
        this.no_parents = no_parents;
        this.other_hosts = other_hosts;
        this.page_requisites = page_requisites;
        this.quiet = quiet;
        this.initial = new GURL(initial);
        this.depth = depth;
        this.destination = destination;
        this.proxy_pass = proxy_pass;
        this.proxy_user = proxy_user;
        this.mummy = mummy;

        progress = new DownloadProgressBar(this, initial.toString(), simple);

        encountered_urls = new Vector();
        failed_urls = new Vector();

        previous_state = STOPPED;
        state = STOPPED;
    }
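
    // Illustrative only: every value below is hypothetical, not taken from GLI.
    // A job mirroring a start url two levels deep, staying below the start url,
    // fetching page requisites and remaining on the same host might be built as
    //
    //   DownloadJob job = new DownloadJob(false, true, false, true, false,
    //       new URL("http://www.example.com/docs/"), 2, destination_path,
    //       proxy_password, proxy_username, wget_owner, true);
    //
    // where destination_path, proxy_password, proxy_username and wget_owner are
    // placeholder names for the caller's own values.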

    /** Depending on which button on the progress bar was pushed,
     * this method will affect the state of the DownloadJob and perhaps make
     * calls to wget.class if necessary.
     * @param event The ActionEvent fired from within the DownloadProgressBar
     * which we must respond to.
     */
    public void actionPerformed(ActionEvent event) {
        // The stop_start_button is used to alternately start or stop the
        // job. If the current state of the job is paused then this
        // restart is logically equivalent to a resume.
        if(event.getSource() == progress.stop_start_button) {
            previous_state = state;
            if (state == RUNNING) {
                state = STOPPED;
            } else {
                //previous_state = state;
                state = RUNNING;
                mummy.resumeThread();
            }
        }
        else if (event.getSource() == progress.close_button) {
            if(state == RUNNING) {
                previous_state = state;
                state = STOPPED; // do we need to do anything else to stop this?
            }
            // else {
            mummy.deleteDownloadJob(this);
            // }
        }
    }

    /** Called by the WGet native code to inform us of a new download starting.
     * @param raw_url The url that is being downloaded, as a String.
     */
    public void addDownload(String raw_url) {
        if(!encountered_urls.contains(raw_url)) {
            encountered_urls.add(raw_url);
        }
        // Regardless, create a new GURL for the current download
        current_url = raw_url;
        url = new GURL(raw_url);
        progress.addDownload(raw_url);
    }

    /** Used to advise the DownloadJob of a newly parsed link. It's up to DownloadJob
     * to decide if it already knows about this url, and if not to
     * update its progress bar.
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link.
     * @return A boolean indicating if the url was added.
     */
    public boolean addLink(String raw_url, int type) {
        ///ystem.out.println("addLink("+url+", "+type+")");
        if(!encountered_urls.contains(raw_url)) {
            // Add it to the urls we've seen.
            encountered_urls.add(raw_url);
            // Add it to the links for the current GURL.

            // Add it to the progress file count.
            progress.increaseFileCount();
            return true;
        }
        // Regardless, add it to the children links of the current GURL
        initial.addLink(raw_url);

        // We've seen it before. Don't count it again.
        return false;
    }

    public void callWGet() {
        // Build the parameter string. Note that we never clobber, and we continue if possible.

        // We want to always download newer files, convert non-relative links to relative,
        // always use directories, and only try twice to get a file before giving up.
        String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";

        if (no_parents) {
            command = command + "-np ";
        }
        if(depth < 0) {
            // Infinite recursion
            command = command + "-r ";
        }
        else if (depth == 0) {
            // Just this page.
        }
        else if (depth > 0) {
            // Recursion to the specified depth.
            command = command + "-r -l" + depth + " ";
        }

        String proxy_url = "";
        // Determine if we have to use a proxy.
        if(Configuration.get("general.use_proxy", true)) {
            String proxy_host = Configuration.getString("general.proxy_host", true);
            String proxy_port = Configuration.getString("general.proxy_port", true);
            // Find out whether the user has already authenticated themselves.
            // GAuthenticator stores credentials keyed by "host:port", in the
            // form "username@password".
            String user_pass = null;
            String address = proxy_host + ":" + proxy_port;
            int count = 0;
            while(count < 3 && (user_pass = (String) GAuthenticator.authentications.get(address)) == null) {
                Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
                count++;
            }
            if(count >= 3) {
                state = STOPPED;
                return;
            }
            if(user_pass.indexOf("@") != -1) {

                // Write the use-proxy command. We stopped doing this for a while and set
                // environment variables instead, hoping they couldn't be spied on with ps
                // the way the following can, but the environment approach didn't work on
                // Windows, so we have gone back to this.
                if (Utility.isWindows()) {
                    command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
                } else {
                    String user_name = user_pass.substring(0, user_pass.indexOf("@"));
                    String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
                    proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
                }

            }
            else {
                DebugStream.println("Unknown user/pass");
            }
        }

        // The user can choose to mirror all of the page requisites...
        if(page_requisites) {
            command = command + "-p ";
        }

        // Download files from other hosts
        if(other_hosts) {
            command = command + "-H ";
        }

        // Finally tell it the site to download.
        command = command + initial.toString();

        if(previous_state == DownloadJob.COMPLETE) {
            progress.mirrorBegun(true, true);
        }
        else {
            progress.mirrorBegun(false, true);
        }

        File dest_file = new File(destination);
        if (!dest_file.exists()) {
            dest_file.mkdirs();
        }
        // Run it
        try {
            //DebugStream.println("Cmd: " + command); // don't print it out cos it may have the password in it
            Runtime rt = Runtime.getRuntime();
            String [] env = null;
            if (!proxy_url.equals("")) {
                env = new String[2];
                env[0] = "http_proxy=http://"+proxy_url;
                env[1] = "ftp_proxy=ftp://"+proxy_url;
            }
            Process prcs = rt.exec(command, env, dest_file);
            InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
            BufferedReader br = new BufferedReader(isr);
            // Capture the standard error stream and search for particular occurrences.
            String line;
            boolean ignore_for_robots = false;
            while ((line = br.readLine()) != null && state != STOPPED) {

                DebugStream.println(line);
                download_log.appendLine(line);
                // The first magic special test is to see if we've just
                // asked for the robots.txt file. If so we ignore
                // the next add and then the next complete/error.
                if(line.lastIndexOf("robots.txt;") != -1) {
                    DebugStream.println("***** Requesting robot.txt");
                    ignore_for_robots = true;
                }
                // If the line contains "=> `", display the text as the
                // currently downloading url. Unique to an add download.
                else if(line.lastIndexOf("=> `") != -1) {
                    if(!ignore_for_robots) {
                        // Add download
                        String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                        addDownload("http:/" + new_url);
                    }
                }
                // If the line contains "/s) - `", set the currently
                // downloading url to "Download Complete".
                else if(line.lastIndexOf("/s) - `") != -1) {
                    String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    if(!ignore_for_robots) {
                        DebugStream.println("Not ignore for robots");
                        // Download complete
                        downloadComplete(current_file_downloading);
                    }
                    else {
                        DebugStream.println("Ignore for robots");
                        ignore_for_robots = false;
                    }
                }
                // The "already there" line begins "File `...". However this is
                // only true in English, so instead I looked and there are few
                // (if any at all) messages, other than those above and the
                // not-overwriting messages, that use " `", so we'll look for
                // that. Note this method is not guaranteed to be unique like
                // the previous two.
                else if(line.lastIndexOf(" `") != -1) {
                    // Not Overwriting
                    DebugStream.println("Already there.");
                    String new_url =
                        line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    addDownload("http:/" + new_url);
                    downloadWarning();
                }
                // Any other important message starts with the time in the form hh:mm:ss
                else if(line.length() > 7) {
                    if(line.charAt(2) == ':' && line.charAt(5) == ':') {
                        if(!ignore_for_robots) {
                            DebugStream.println("Error.");
                            downloadFailed();
                        }
                        else {
                            ignore_for_robots = false;
                        }
                    }
                }
            }
            if(state == STOPPED) {
                isr.close();
                prcs.destroy(); // This doesn't always work, but it's worth a try
            }
            else {
                // Now display the final message based on the exit value
                prcs.waitFor();
            }
        }
        catch (Exception ioe) {
            //message(Utility.ERROR, ioe.toString());
            DebugStream.printStackTrace(ioe);
        }
        // If we've got to here and the state isn't STOPPED then the
        // job is complete.
        if(state == DownloadJob.RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = DownloadJob.COMPLETE;
        }
        // Refresh the workspace tree
        Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
    }
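
    // For illustration only (the url and depth here are hypothetical, not defaults):
    // with no_parents and page_requisites set and a depth of 3, callWGet() assembles
    // a command along the lines of
    //
    //   wget -N -k -x -t 2 -np -r -l3 -p http://www.example.com/
    //
    // with any proxy credentials either appended to the command line (Windows) or
    // passed to Runtime.exec() through the http_proxy/ftp_proxy environment variables.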


    /** The most important part of the DownloadJob class, this method is
     * responsible for calling the WGet native methods used to
     * mirror the indicated url. By this stage all the variables
     * necessary should be set and we need only build up the
     * parameter string and make the call.
     */
    public void callWGetNative() {
        Vector args = new Vector();

        // Let the DownloadProgressBar know we're starting, just in case
        // the user hasn't told us to. If this is the second time the
        // urls have been downloaded and the first attempt was successful
        // (i.e. the previous job was complete), then the user is forcing
        // us to remirror, so reset all the values.
        // Note that this can cause the result line to look something
        // like this:
        //     Downloaded 12 of 12 files (8 warnings, 0 errors).
        // The warnings would be something like 'File already downloaded',
        // but the total number of files and the files successfully
        // downloaded will be correct.
        if(previous_state == DownloadJob.COMPLETE) {
            progress.mirrorBegun(true, false);
        }
        else {
            progress.mirrorBegun(false, false);
        }

        // Parse arguments into the array.
        args.add(Utility.BASE_DIR + "wget");
        args.add("-d");
        args.add("-o");
        args.add("debug.txt");

        if(destination != null) {
            args.add("-P");
            args.add(destination);
        }

        if(depth < 0) {
            // Infinite recursion
            args.add("-r");
        }
        else if (depth == 0) {
            // Just this page.
        }
        else if (depth > 0) {
            // Recursion to the specified depth.
            args.add("-r");
            args.add("-l");
            args.add("" + depth + ""); // Hacky
        }

        if(previous_state == PAUSED) {
            args.add("-nc");
            args.add("-c");
        }

        if(proxy_user != null) {
            args.add("--proxy-user=" + proxy_user);
            args.add("--proxy-passwd=" + proxy_pass);
        }

        if(page_requisites) {
            args.add("-p");
        }

        if(quiet) {
            args.add("-q");
        }

        if(other_hosts) {
            args.add("-H");
        }

        args.add(initial.toString());

        DebugStream.println("Calling wget ");
        for(Enumeration e = args.elements(); e.hasMoreElements();) {
            DebugStream.println(e.nextElement() + " ");
        }
        DebugStream.println("");

        // Run home to mummy.
        int value = mummy.wget(args.size(), args.toArray(), debug);

        // If we've got to here and the state isn't STOPPED then the job is complete.
        if(state == RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = COMPLETE;
        }
    }
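
    // For illustration only (the destination, depth and url below are hypothetical):
    // a depth-2 native run with a destination folder and page requisites enabled ends
    // up passing mummy.wget() an argument vector roughly equivalent to
    //
    //   wget -d -o debug.txt -P /path/to/destination -r -l 2 -p http://www.example.com/
    //
    // where the first element is actually Utility.BASE_DIR + "wget".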

    /** Called by the WGet native code when the current download is
     * completed. In turn all download listeners are informed.
     */
    public void downloadComplete() {
        progress.downloadComplete();
        url = null;
        current_url = null;
    }


    public void downloadComplete(String current_file_downloading)
    {
        progress.downloadComplete();
        DebugStream.println("Download complete: " + current_file_downloading);
    }


    /** Called by the WGet native code when the requested download returns
     * a status code other than 200.
     */
    public void downloadFailed() {
        failed_urls.add(current_url); // It is the current url that failed
        progress.downloadFailed();
        DebugStream.println("Download failed: " + current_url);
    }

    /** Called when the current download produces a warning, for example when
     * the file has already been downloaded.
     */
    public void downloadWarning() {
        progress.downloadWarning();
    }


    /**
     * @return A String representing the initial url's host (the root node
     * of the tree that we are mirroring).
     */
    public String getHost() {
        return url.getHost();
    }

    public AppendLineOnlyFileDocument getLogDocument() {
        return download_log;
    }

    /**
     * @return The progress bar associated with this job.
     */
    public DownloadProgressBar getProgressBar() {
        return progress;
    }

    /** Called to discover if the user wanted this thread to run or if
     * it is paused.
     * @return An int representing the current DownloadJob state.
     */
    public int getState() {
        return state;
    }

    /** Returns the current state of the stop flag for this job.
     * @return A boolean representing whether the user has requested to
     * stop.
     */
    public boolean hasSignalledStop() {
        if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
           state == DownloadJob.COMPLETE) {
            return true;
        }
        return false;
    }

    public void setState(int state) {
        previous_state = this.state;
        this.state = state;
    }

    /** A convenience call.
     * @return A String representing the initial url (the root node of the mirrored tree).
     */
    public String toString() {
        return initial.toString();
    }

    /** Called by the WGet native code to signal the current progress of
     * downloading.
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
        progress.updateProgress(current, expected);
    }
}