source: trunk/gli/src/org/greenstone/gatherer/collection/DownloadJob.java@ 12345

Last change on this file since 12345 was 11507, checked in by shaoqun, 18 years ago

add new parameter for wget that specifies which file extensions should be accepted

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.WGet;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.gui.DownloadProgressBar;
52import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
53import org.greenstone.gatherer.util.GURL;
54import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
55import org.greenstone.gatherer.util.Utility;
56/**
57 * @author John Thompson, Greenstone Digital Library, University of Waikato
58 * @version 2.0
59 */
60public class DownloadJob
61 implements ActionListener {
62
63 private boolean debug;
64 private boolean higher_directories;
65 private boolean no_parents;
66 private boolean other_hosts;
67 private boolean page_requisites;
68 private boolean quiet;
69
70 private AppendLineOnlyFileDocument download_log;
71
72 private DownloadProgressBar progress;
73
74 private GURL initial = null;
75 private GURL url = null;
76
77 // private TreeModel model;
78
79 private int depth;
80 private int previous_state;
81 private int state;
82
83 private String current_url;
84 private String destination;
85 private String proxy_pass;
86 private String proxy_user;
87
88 private Vector encountered_urls;
89 private Vector failed_urls;
90
91 private WGet mummy;
92
93 public static int COMPLETE = 0;
94 public static int PAUSED = 1;
95 public static int RUNNING = 2;
96 public static int STOPPED = 3;
97
98 /**
99 */
100 public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
101 // this.model = model;
102
103 String log_filename = Gatherer.getGLIUserLogDirectoryPath() + "wget" + initial.hashCode() + ".log";
104 File log_file = new File(log_filename);
105 if(log_file.exists()) {
106 log_file.delete();
107 }
108 File parent_log_file = log_file.getParentFile();
109 parent_log_file.mkdirs();
110 parent_log_file = null;
111 log_file = null;
112
113 this.debug = debug;
114 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
115 this.no_parents = no_parents;
116 this.other_hosts = other_hosts;
117 this.page_requisites = page_requisites;
118 this.quiet = quiet;
119 this.initial = new GURL(initial);
120 this.depth = depth;
121 this.destination = destination;
122 this.proxy_pass = proxy_pass;
123 this.proxy_user = proxy_user;
124 this.mummy = mummy;
125
126 progress = new DownloadProgressBar(this, initial.toString(), simple);
127
128 encountered_urls = new Vector();
129 failed_urls = new Vector();
130
131 previous_state = STOPPED;
132 state = STOPPED;
133 }
134
    /** Depending on which button on the progress bar was pushed,
     * this method will affect the state of the DownloadJob and perhaps make
     * calls to wget.class if necessary.
     * @param event The ActionEvent fired from within the DownloadProgressBar
     * which we must respond to.
     */
    public void actionPerformed(ActionEvent event) {
	// The stop_start_button is used to alternately start or stop the
	// job. If the current state of the job is paused then this
	// restart is logically equivalent to a resume.
	if(event.getSource() == progress.stop_start_button) {
	    previous_state = state;
	    if (state == RUNNING) {
		state = STOPPED;
	    } else {
		//previous_state = state;
		state = RUNNING;
		mummy.resumeThread();
	    }
	}
	else if (event.getSource() == progress.close_button) {
	    if(state == RUNNING) {
		previous_state = state;
		state = STOPPED; // do we need to do anything else to stop this?
	    }
	    // else {
	    mummy.deleteDownloadJob(this);
	    // }
	}
    }
165
166 /** Called by the WGet native code to inform us of a new download starting.
167 * @param raw_url The url that is being downloaded, as a String.
168 */
169 public void addDownload(String raw_url) {
170 if(!encountered_urls.contains(raw_url)) {
171 encountered_urls.add(raw_url);
172 }
173 // Regardless create a new GURL
174 current_url = raw_url;
175 url = new GURL(raw_url);
176 progress.addDownload(raw_url);
177 }
178
179 /** Used to advise the DownloadJob of a newly parsed link. Its up to DownloadJob
180 * to decide if it already knows about this url, and if not to
181 * update its progress bar.
182 * @param raw_url The url in question as a String.
183 * @param type Whether the link is an internal or external link.
184 * @return A boolean indicating if the url was added.
185 */
186 public boolean addLink(String raw_url, int type) {
187 ///System.out.println("addLink("+url+", "+type+")");
188 if(!encountered_urls.contains(raw_url)) {
189 // Add it to the urls we've seen.
190 encountered_urls.add(raw_url);
191 // Add it the to links for the current GURL.
192
193 // Add it to the progress file count.
194 progress.increaseFileCount();
195 return true;
196 }
197 // Regardless add it to the children links of the current GURL
198 initial.addLink(raw_url);
199
200 // We've seen it before. Don't count it again.
201 return false;
202 }
203
204 public void callWGet() {
205 // Build parameter string. Note that we never clobber, and we continue if possible
206
207 // want to always download newer files, convert non-relative links to relative, always use directories, and only try twice to get a file before giving up
208 String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";
209
210 if (no_parents) {
211 command = command + "-np ";
212 }
213 if(depth < 0) {
214 // Infinite recursion
215 command = command + "-r ";
216 }
217 else if (depth == 0) {
218 // Just this page.
219 }
220 else if (depth > 0) {
221 // Recursion to the specified depth.
222 command = command + "-r -l" + depth + " ";
223 }
224
225 String proxy_url = "";
226 // Determine if we have to use a proxy.
227 if(Configuration.get("general.use_proxy", true)) {
228 String proxy_host = Configuration.getString("general.proxy_host", true);
229 String proxy_port = Configuration.getString("general.proxy_port", true);
230 // Find out whether the user has already authenticated themselves
231 String user_pass = null;
232 String address = proxy_host + ":" + proxy_port;
233 int count = 0;
234 while(count < 3 && (user_pass = (String) GAuthenticator.authentications.get(address)) == null) {
235 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
236 count++;
237 }
238 if(count >= 3) {
239 state = STOPPED;
240 return;
241 }
242 if(user_pass.indexOf("@") != -1) {
243
244 // Write the use proxy command - we don't do this anymore, instead we set environment variables - hopefully these can't be spied on like the follwoing can (using ps) - actually the environment stuff didn't work for windows, so lets go back to this
245 if (Utility.isWindows()) {
246 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
247 } else {
248 String user_name = user_pass.substring(0, user_pass.indexOf("@"));
249 String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
250 proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
251 }
252
253 }
254 else {
255 DebugStream.println("Unknown user/pass");
256 }
257 }
258
259
260
261
262 // The user can choose to mirror all of the page requisites...
263 if(page_requisites) {
264 command = command + "-p ";
265 }
266 else{ //or only HTML page (ignore associated files e.g images, stylesheets)
267 command = command + "-A " + ".html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
268 }
269
270 // Download files from other hosts
271 if(other_hosts) {
272 command = command + "-H ";
273 }
274
275 // Finally tell it the site to download.
276 command = command + initial.toString();
277
278 if(previous_state == DownloadJob.COMPLETE) {
279 progress.mirrorBegun(true, true);
280 }
281 else {
282 progress.mirrorBegun(false, true);
283 }
284
285 File dest_file = new File(destination);
286 if (!dest_file.exists()) {
287 dest_file.mkdirs();
288 }
289 // Run it
290
291 //System.out.println("***"+command);
292 try {
293 //DebugStream.println("Cmd: " + command); // don't print it out cos it may have the password in it
294 Runtime rt = Runtime.getRuntime();
295 String [] env = null;
296 if (!proxy_url.equals("")) {
297 env = new String[2];
298 env[0] = "http_proxy=http://"+proxy_url;
299 env[1] = "ftp_proxy=ftp://"+proxy_url;
300 }
301 Process prcs = rt.exec(command, env, dest_file);
302 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
303 BufferedReader br = new BufferedReader(isr);
304 // Capture the standard error stream and seach for two particular occurances.
305 String line;
306 boolean ignore_for_robots = false;
307 while ((line = br.readLine()) != null && state != STOPPED) {
308
309 DebugStream.println(line);
310 download_log.appendLine(line);
311 // The first magic special test is to see if we've just
312 // asked for the robots.txt file. If so we ignore
313 // the next add and then the next complete/error.
314 if(line.lastIndexOf("robots.txt;") != -1) {
315 DebugStream.println("***** Requesting robot.txt");
316 ignore_for_robots = true;
317 }
318 // If line contains "=> `" display text as the
319 // currently downloading url. Unique to add download.
320 else if(line.lastIndexOf("=> `") != -1) {
321 if(!ignore_for_robots) {
322 // Add download
323 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
324 addDownload("http:/" + new_url);
325 }
326 }
327 // If line contains "/s) - `" set currently
328 // downloading url to "Download Complete".
329 else if(line.lastIndexOf("/s) - `") != -1) {
330 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
331 if(!ignore_for_robots) {
332 DebugStream.println("Not ignore for robots");
333 // Download complete
334 downloadComplete(current_file_downloading);
335 }
336 else {
337 DebugStream.println("Ignore for robots");
338 ignore_for_robots = false;
339 }
340 }
341 // The already there line begins "File `..." However this
342 // is only true in english, so instead I looked and there
343 // are few (if any at all) other messages than those above
344 // and not overwriting messages that use " `" so we'll
345 // look for that. Note this method is not guarenteed to be
346 // unique like the previous two.
347 else if(line.lastIndexOf(" `") != -1) {
348 // Not Overwriting
349 DebugStream.println("Already there.");
350 String new_url =
351 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
352 addDownload("http:/" + new_url);
353 downloadWarning();
354 }
355 // Any other important message starts with the time in the form hh:mm:ss
356 else if(line.length() > 7) {
357 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
358 if(!ignore_for_robots) {
359 DebugStream.println("Error.");
360 downloadFailed();
361 }
362 else {
363 ignore_for_robots = false;
364 }
365 }
366 }
367 }
368 if(state == STOPPED) {
369 isr.close();
370 prcs.destroy(); // This doesn't always work, but it's worth a try
371 }
372 else {
373 // Now display final message based on exit value
374 prcs.waitFor();
375 }
376 }
377 catch (Exception ioe) {
378 //message(Utility.ERROR, ioe.toString());
379 DebugStream.printStackTrace(ioe);
380 }
381 // If we've got to here and the state isn't STOPPED then the
382 // job is complete.
383 if(state == DownloadJob.RUNNING) {
384 progress.mirrorComplete();
385 previous_state = state;
386 state = DownloadJob.COMPLETE;
387
388 }
389 // refresh the workspace tree
390 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
391 }
392
393
394 /** The most important part of the DownloadJob class, this method is
395 * responsible for calling the WGet native methods used to
396 * mirror the indicated url. By this stage all the variables
397 * necessary should be set and we need only build up the
398 * parameter string and make the call.
399 */
400 public void callWGetNative() {
401 Vector args = new Vector();
402
403 // Let the DownloadProgressBar know we're starting, just in case
404 // the user hasn't told us to. If this is the second time the
405 // urls downloaded and the first attempt was successful (ie
406 // the previous job was complete), then we have the case where
407 // the user is forcing us to remirror. Reset all the values etc
408 // if this is the case then reset the variables.
409 // Note that this can cause the result line to look something
410 // like this.
411 // Downloaded 12 of 12 files (8 warnings, 0 errors).
412 // The warnings would be something like, 'File already downloaded'
413 // but the total number of files and the file successfully
414 // downloaded will be correct.
415 if(previous_state == DownloadJob.COMPLETE) {
416 progress.mirrorBegun(true, false);
417 }
418 else {
419 progress.mirrorBegun(false, false);
420 }
421
422 // Parse arguments into array.
423 args.add(Configuration.getWGetPath());
424 args.add("-d");
425 args.add("-o");
426 args.add("debug.txt");
427
428 if(destination != null) {
429 args.add("-P");
430 args.add(destination);
431 }
432
433 if(depth < 0) {
434 // Infinite recursion
435 args.add("-r");
436 }
437 else if (depth == 0) {
438 // Just this page.
439 }
440 else if (depth > 0) {
441 // Recursion to the specified depth.
442 args.add("-r");
443 args.add("-l");
444 args.add("" + depth + ""); // Hacky
445 }
446
447 if(previous_state == PAUSED) {
448 args.add("-nc");
449 args.add("-c");
450 }
451
452 if(proxy_user != null) {
453 args.add("--proxy-user=" + proxy_user);
454 args.add("--proxy-passwd=" + proxy_pass);
455 }
456
457 if(page_requisites) {
458 args.add("-p");
459 }
460
461 if(quiet) {
462 args.add("-q");
463 }
464
465 if(other_hosts) {
466 args.add("-H");
467 }
468
469 args.add(initial.toString());
470
471 DebugStream.println("Calling wget ");
472 for(Enumeration e = args.elements(); e.hasMoreElements();) {
473 DebugStream.println(e.nextElement() + " ");
474 }
475 DebugStream.println("");
476
477 // Run home to mummy.
478 int value = mummy.wget(args.size(), args.toArray(), debug);
479
480 // If we've got to here and the state isn't STOPPED then the job is complete.
481 if(state == RUNNING) {
482 progress.mirrorComplete();
483 previous_state = state;
484 state = COMPLETE;
485 }
486 }
487
488 /** Called by the WGet native code when the current download is
489 * completed. In turn all download listeners are informed.
490 */
491 public void downloadComplete() {
492 progress.downloadComplete();
493 url = null;
494 current_url = null;
495 }
496
497
498 public void downloadComplete(String current_file_downloading)
499 {
500 progress.downloadComplete();
501 DebugStream.println("Download complete: " + current_file_downloading);
502 }
503
504
505 /** Called by the WGet native code when the requested download returns
506 * a status code other than 200.
507 */
508 public void downloadFailed() {
509 failed_urls.add(current_url); // It is the current url that failed
510 progress.downloadFailed();
511 DebugStream.println("Download failed: " + current_url);
512 }
513
514 /**
515 */
516 public void downloadWarning() {
517 progress.downloadWarning();
518 }
519
520
521 /**
522 * @return A String representing the initial urls host (root node
523 * of tree that we are mirroring).
524 */
525 public String getHost() {
526 return url.getHost();
527 }
528
529 public AppendLineOnlyFileDocument getLogDocument() {
530 return download_log;
531 }
532
533 /**
534 * @return Returns the progress bar associated with this job.
535 */
536 public DownloadProgressBar getProgressBar() {
537 return progress;
538 }
539
540 /** Called to discover if the user wanted this thread to run or if
541 * it is paused.
542 * @return An int representing the current DownloadJob state.
543 */
544 public int getState() {
545 return state;
546 }
547
548 /** Returns the current state of the stop flag for this job.
549 * @return A boolean representing whether the user has requested to
550 * stop.
551 */
552 public boolean hasSignalledStop() {
553 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
554 state == DownloadJob.COMPLETE) {
555 return true;
556 }
557 return false;
558 }
559
560 public void setState(int state) {
561 previous_state = this.state;
562 this.state = state;
563 }
564
565 /** A convinence call.
566 * @return A String representing the url of the initial url (root node of the mirrored tree).
567 */
568 public String toString() {
569 return initial.toString();
570 }
571
572 /** Called by the WGet native code to signal the current progress of
573 * downloading.
574 * @param current A long representing the number of bytes that have
575 * been downloaded since the last update.
576 * @param expected A long representing the total number of bytes
577 * expected for this download.
578 */
579 public void updateProgress(long current, long expected) {
580 progress.updateProgress(current, expected);
581 }
582}
Note: See TracBrowser for help on using the repository browser.