source: trunk/gli/src/org/greenstone/gatherer/collection/DownloadJob.java@ 8236

Last change on this file since 8236 was 8236, checked in by mdewsnip, 20 years ago

Replaced all Gatherer.print* with DebugStream.print*.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.8 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.WGet;
49import org.greenstone.gatherer.file.FileNode;
50import org.greenstone.gatherer.file.FileSystemModel;
51import org.greenstone.gatherer.file.WorkspaceTreeModel;
52import org.greenstone.gatherer.gui.DownloadProgressBar;
53import org.greenstone.gatherer.gui.tree.WorkspaceTree;
54import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
55import org.greenstone.gatherer.util.GURL;
56import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
57import org.greenstone.gatherer.util.Utility;
58/**
59 * @author John Thompson, Greenstone Digital Library, University of Waikato
60 * @version 2.0
61 */
62public class DownloadJob
63 implements ActionListener {
64
65 private boolean debug;
66 private boolean higher_directories;
67 private boolean no_parents;
68 private boolean other_hosts;
69 private boolean page_requisites;
70 private boolean quiet;
71
72 private AppendLineOnlyFileDocument download_log;
73
74 private DownloadProgressBar progress;
75
76 private GURL initial = null;
77 private GURL url = null;
78
79 // private TreeModel model;
80
81 private int depth;
82 private int previous_state;
83 private int state;
84
85 private String current_url;
86 private String destination;
87 private String proxy_pass;
88 private String proxy_user;
89
90 private Vector encountered_urls;
91 private Vector failed_urls;
92
93 private WGet mummy;
94
95 public static int COMPLETE = 0;
96 public static int PAUSED = 1;
97 public static int RUNNING = 2;
98 public static int STOPPED = 3;
99
100 /**
101 */
102 public DownloadJob(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
103 // this.model = model;
104
105 String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
106 File log_file = new File(log_filename);
107 if(log_file.exists()) {
108 log_file.delete();
109 }
110 File parent_log_file = log_file.getParentFile();
111 parent_log_file.mkdirs();
112 parent_log_file = null;
113 log_file = null;
114
115 this.debug = debug;
116 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
117 this.no_parents = no_parents;
118 this.other_hosts = other_hosts;
119 this.page_requisites = page_requisites;
120 this.quiet = quiet;
121 this.initial = new GURL(initial);
122 this.depth = depth;
123 this.destination = destination;
124 this.proxy_pass = proxy_pass;
125 this.proxy_user = proxy_user;
126 this.mummy = mummy;
127
128 progress = new DownloadProgressBar(this, initial.toString(), simple);
129
130 encountered_urls = new Vector();
131 failed_urls = new Vector();
132
133 previous_state = STOPPED;
134 state = STOPPED;
135 }
136
137 /** Depending on which button on the progress bar was pushed,
138 * this method will affect the state of the DownloadJob and perhaps make
139 * calls to wget.class if necessary.
140 * @param event The ActionEvent fired from within the DownloadProgressBar
141 * which we must respond to.
142 */
143 public void actionPerformed(ActionEvent event) {
144 // The stop_start_button is used to alternately start or stop the
145 // job. If the current state of the job is paused then this
146 // restart is logically equivelent to a resume.
147 if(event.getSource() == progress.stop_start_button) {
148 previous_state = state;
149 if (state == RUNNING) {
150 state = STOPPED;
151 } else {
152 //previous_state = state;
153 state = RUNNING;
154 mummy.resumeThread();
155 }
156 }
157 else if (event.getSource() == progress.close_button) {
158 if(state == RUNNING) {
159 previous_state = state;
160 state = STOPPED; // do we need to do anything else to stop this?
161 }
162 // else {
163 mummy.deleteDownloadJob(this);
164 // }
165 }
166 }
167
168 /** Called by the WGet native code to inform us of a new download starting.
169 * @param raw_url The url that is being downloaded, as a String.
170 */
171 public void addDownload(String raw_url) {
172 if(!encountered_urls.contains(raw_url)) {
173 encountered_urls.add(raw_url);
174 }
175 // Regardless create a new GURL
176 current_url = raw_url;
177 url = new GURL(raw_url);
178 progress.addDownload(raw_url);
179 }
180
    /** Used to advise the DownloadJob of a newly parsed link. It is up to the
     * DownloadJob to decide if it already knows about this url, and if not to
     * update its progress bar's expected file count.
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link (currently unused).
     * @return A boolean indicating if the url was added.
     */
    public boolean addLink(String raw_url, int type) {
        ///ystem.out.println("addLink("+url+", "+type+")");
        if(!encountered_urls.contains(raw_url)) {
            // First sighting: remember it and count it towards the total
            // number of files the progress bar expects.
            encountered_urls.add(raw_url);
            // NOTE(review): because of this early return, a first-time url is
            // never passed to initial.addLink() below, despite the
            // "regardless" wording — only repeat sightings reach it. Confirm
            // whether that is intentional before changing.

            progress.increaseFileCount();
            return true;
        }
        // Seen before: record it as a child link of the initial url.
        initial.addLink(raw_url);

        // We've seen it before. Don't count it again.
        return false;
    }
205
206 public void callWGet() {
207 // Build parameter string. Note that we never clobber, and we continue if possible
208
209 // want to always download newer files, convert non-relative links to relative, always use directories, and only try twice to get a file before giving up
210 String command = Configuration.getWGetPath() + " -N -k -x -t 2 "; // + " -nc -c ";
211
212 if (no_parents) {
213 command = command + "-np ";
214 }
215 if(depth < 0) {
216 // Infinite recursion
217 command = command + "-r ";
218 }
219 else if (depth == 0) {
220 // Just this page.
221 }
222 else if (depth > 0) {
223 // Recursion to the specified depth.
224 command = command + "-r -l" + depth + " ";
225 }
226
227 String proxy_url = "";
228 // Determine if we have to use a proxy.
229 if(Configuration.get("general.use_proxy", true)) {
230 String proxy_host = Configuration.getString("general.proxy_host", true);
231 String proxy_port = Configuration.getString("general.proxy_port", true);
232 // Find out whether the user has already authenticated themselves
233 String user_pass = null;
234 String address = proxy_host + ":" + proxy_port;
235 int count = 0;
236 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
237 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
238 count++;
239 }
240 if(count >= 3) {
241 state = STOPPED;
242 return;
243 }
244 if(user_pass.indexOf("@") != -1) {
245
246 // Write the use proxy command - we don't do this anymore, instead we set environment variables - hopefully these can't be spied on like the follwoing can (using ps) - actually the environment stuff didn't work for windows, so lets go back to this
247 if (Utility.isWindows()) {
248 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
249 } else {
250 String user_name = user_pass.substring(0, user_pass.indexOf("@"));
251 String user_pwd = user_pass.substring(user_pass.indexOf("@") + 1);
252 proxy_url = user_name+":"+user_pwd+"@"+proxy_host+":"+proxy_port+"/";
253 }
254
255 }
256 else {
257 DebugStream.println("Unknown user/pass");
258 }
259 }
260
261 // The user can choose to mirror all of the page requisites...
262 if(page_requisites) {
263 command = command + "-p ";
264 }
265
266 // Download files from other hosts
267 if(other_hosts) {
268 command = command + "-H ";
269 }
270
271 // Finally tell it the site to download.
272 command = command + initial.toString();
273
274 if(previous_state == DownloadJob.COMPLETE) {
275 progress.mirrorBegun(true, true);
276 }
277 else {
278 progress.mirrorBegun(false, true);
279 }
280
281 File dest_file = new File(destination);
282 if (!dest_file.exists()) {
283 dest_file.mkdirs();
284 }
285 // Run it
286 try {
287 //DebugStream.println("Cmd: " + command); // don't print it out cos it may have the password in it
288 Runtime rt = Runtime.getRuntime();
289 String [] env = null;
290 if (!proxy_url.equals("")) {
291 env = new String[2];
292 env[0] = "http_proxy=http://"+proxy_url;
293 env[1] = "ftp_proxy=ftp://"+proxy_url;
294 }
295 Process prcs = rt.exec(command, env, dest_file);
296 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
297 BufferedReader br = new BufferedReader(isr);
298 // Capture the standard error stream and seach for two particular occurances.
299 String line;
300 boolean ignore_for_robots = false;
301 while ((line = br.readLine()) != null && state != STOPPED) {
302
303 DebugStream.println(line);
304 download_log.appendLine(line);
305 // The first magic special test is to see if we've just
306 // asked for the robots.txt file. If so we ignore
307 // the next add and then the next complete/error.
308 if(line.lastIndexOf("robots.txt;") != -1) {
309 DebugStream.println("***** Requesting robot.txt");
310 ignore_for_robots = true;
311 }
312 // If line contains "=> `" display text as the
313 // currently downloading url. Unique to add download.
314 else if(line.lastIndexOf("=> `") != -1) {
315 if(!ignore_for_robots) {
316 // Add download
317 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
318 addDownload("http:/" + new_url);
319 }
320 }
321 // If line contains "/s) - `" set currently
322 // downloading url to "Download Complete".
323 else if(line.lastIndexOf("/s) - `") != -1) {
324 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
325 if(!ignore_for_robots) {
326 DebugStream.println("Not ignore for robots");
327 // Download complete
328 downloadComplete(current_file_downloading);
329 }
330 else {
331 DebugStream.println("Ignore for robots");
332 ignore_for_robots = false;
333 }
334 }
335 // The already there line begins "File `..." However this
336 // is only true in english, so instead I looked and there
337 // are few (if any at all) other messages than those above
338 // and not overwriting messages that use " `" so we'll
339 // look for that. Note this method is not guarenteed to be
340 // unique like the previous two.
341 else if(line.lastIndexOf(" `") != -1) {
342 // Not Overwriting
343 DebugStream.println("Already there.");
344 String new_url =
345 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
346 addDownload("http:/" + new_url);
347 downloadWarning();
348 }
349 // Any other important message starts with the time in the form hh:mm:ss
350 else if(line.length() > 7) {
351 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
352 if(!ignore_for_robots) {
353 DebugStream.println("Error.");
354 downloadFailed();
355 }
356 else {
357 ignore_for_robots = false;
358 }
359 }
360 }
361 }
362 if(state == STOPPED) {
363 isr.close();
364 prcs.destroy(); // This doesn't always work, but it's worth a try
365 }
366 else {
367 // Now display final message based on exit value
368 prcs.waitFor();
369 }
370 }
371 catch (Exception ioe) {
372 //message(Utility.ERROR, ioe.toString());
373 DebugStream.printStackTrace(ioe);
374 }
375 // If we've got to here and the state isn't STOPPED then the
376 // job is complete.
377 if(state == DownloadJob.RUNNING) {
378 progress.mirrorComplete();
379 previous_state = state;
380 state = DownloadJob.COMPLETE;
381
382 }
383 // refresh the workspace tree
384 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
385 }
386
387
388 /** The most important part of the DownloadJob class, this method is
389 * responsible for calling the WGet native methods used to
390 * mirror the indicated url. By this stage all the variables
391 * necessary should be set and we need only build up the
392 * parameter string and make the call.
393 */
394 public void callWGetNative() {
395 Vector args = new Vector();
396
397 // Let the DownloadProgressBar know we're starting, just in case
398 // the user hasn't told us to. If this is the second time the
399 // urls downloaded and the first attempt was successful (ie
400 // the previous job was complete), then we have the case where
401 // the user is forcing us to remirror. Reset all the values etc
402 // if this is the case then reset the variables.
403 // Note that this can cause the result line to look something
404 // like this.
405 // Downloaded 12 of 12 files (8 warnings, 0 errors).
406 // The warnings would be something like, 'File already downloaded'
407 // but the total number of files and the file successfully
408 // downloaded will be correct.
409 if(previous_state == DownloadJob.COMPLETE) {
410 progress.mirrorBegun(true, false);
411 }
412 else {
413 progress.mirrorBegun(false, false);
414 }
415
416 // Parse arguments into array.
417 args.add(Utility.BASE_DIR + "wget");
418 args.add("-d");
419 args.add("-o");
420 args.add("debug.txt");
421
422 if(destination != null) {
423 args.add("-P");
424 args.add(destination);
425 }
426
427 if(depth < 0) {
428 // Infinite recursion
429 args.add("-r");
430 }
431 else if (depth == 0) {
432 // Just this page.
433 }
434 else if (depth > 0) {
435 // Recursion to the specified depth.
436 args.add("-r");
437 args.add("-l");
438 args.add("" + depth + ""); // Hacky
439 }
440
441 if(previous_state == PAUSED) {
442 args.add("-nc");
443 args.add("-c");
444 }
445
446 if(proxy_user != null) {
447 args.add("--proxy-user=" + proxy_user);
448 args.add("--proxy-passwd=" + proxy_pass);
449 }
450
451 if(page_requisites) {
452 args.add("-p");
453 }
454
455 if(quiet) {
456 args.add("-q");
457 }
458
459 if(other_hosts) {
460 args.add("-H");
461 }
462
463 args.add(initial.toString());
464
465 DebugStream.println("Calling wget ");
466 for(Enumeration e = args.elements(); e.hasMoreElements();) {
467 DebugStream.println(e.nextElement() + " ");
468 }
469 DebugStream.println("");
470
471 // Run home to mummy.
472 int value = mummy.wget(args.size(), args.toArray(), debug);
473
474 // If we've got to here and the state isn't STOPPED then the job is complete.
475 if(state == RUNNING) {
476 progress.mirrorComplete();
477 previous_state = state;
478 state = COMPLETE;
479 }
480 }
481
482 /** Called by the WGet native code when the current download is
483 * completed. In turn all download listeners are informed.
484 */
485 public void downloadComplete() {
486 progress.downloadComplete();
487 url = null;
488 current_url = null;
489 }
490
491 public void downloadComplete(String current_file_downloading) {
492 progress.downloadComplete();
493 DebugStream.println("Current File: " + current_file_downloading);
494 // !! TEMPORARILY DISABLED !!
495 //WorkspaceTreeModel.refreshWebCacheMappings();
496// if(Gatherer.g_man.gather_pane.workspace_tree != null) {
497// FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.gather_pane.workspace_tree.getModel();
498// File new_file = new File(current_file_downloading);
499// File parent_file = new_file.getParentFile();
500// String download_cache = Utility.getCacheDir().getAbsolutePath();
501// ArrayList raw_path = new ArrayList();
502// while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
503// raw_path.add(0, parent_file.getName());
504// parent_file = parent_file.getParentFile();
505// }
506// download_cache = null;
507// // Add download cache name
508// /** @todo - add to dictionary */
509// raw_path.add(0, "Mirroring.Mirror_Cache");
510// // And the root node
511// raw_path.add(0, tree_model.getRoot());
512// TreePath destination_path = new TreePath(raw_path.toArray());
513// raw_path = null;
514// // Retrieve the destination node
515// FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
516// // destination_path = null;
517// //FileNode new_file_node = new FileNode(new_file);
518
519// // It suddenly occurs to me that by retrieving the destination path, we are causing the potential destination node to map its children which includes the file which I am about to add. Hence I was ending up with two copies.
520// ///atherer.println("Ready to insert new FileNode.");
521// DebugStream.println("Model: " + tree_model);
522// DebugStream.println("Destination path: " + destination_path);
523// destination_node.unmap();
524// ///atherer.println("Destination node: " + destination_node);
525// ///atherer.println("New node: " + new_file_node);
526// //SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);
527
528// //new_file_node = null;
529// destination_node = null;
530// tree_model = null;
531// }
532// url = null;
533// current_url = null;
534 }
535
536 /** Called by the WGet native code when the requested download returns
537 * a status code other than 200.
538 */
539 public void downloadFailed() {
540 ///ystem.out.println("downloadFailed("+current_url+")");
541 failed_urls.add(current_url); // Its the current url thats failed.
542 progress.downloadFailed();
543 }
544
    /** Called when the current download produced a warning (callWGet raises
     * this for the "already there, not overwriting" case); forwarded straight
     * to the progress bar.
     */
    public void downloadWarning() {
        progress.downloadWarning();
    }
550
551 /**
552 * @return A String representing the currently downloading url.
553 */
554 /* private String getCurrent() {
555 return current_url;
556 } */
557
558 /**
559 * @return A String representing the initial urls host (root node
560 * of tree that we are mirroring).
561 */
562 public String getHost() {
563 return url.getHost();
564 }
565
    /** @return The document that accumulates this job's wget log output. */
    public AppendLineOnlyFileDocument getLogDocument() {
        return download_log;
    }
569
    /**
     * @return The DownloadProgressBar associated with this job.
     */
    public DownloadProgressBar getProgressBar() {
        return progress;
    }
576
    /** Called to discover if the user wanted this thread to run or if
     * it is paused.
     * @return An int representing the current DownloadJob state (one of
     * COMPLETE, PAUSED, RUNNING or STOPPED).
     */
    public int getState() {
        return state;
    }
584
585 /** Returns the current state of the stop flag for this job.
586 * @return A boolean representing whether the user has requested to
587 * stop.
588 */
589 public boolean hasSignalledStop() {
590 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
591 state == DownloadJob.COMPLETE) {
592 return true;
593 }
594 return false;
595 }
596
597 public void setState(int state) {
598 previous_state = this.state;
599 this.state = state;
600 }
601
    /** A convenience call.
     * @return A String representing the url of the initial url (root node of the mirrored tree).
     */
    public String toString() {
        return initial.toString();
    }
608
    /** Called by the WGet native code to signal the current progress of
     * downloading; forwarded straight to the progress bar.
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
        progress.updateProgress(current, expected);
    }
619}
Note: See TracBrowser for help on using the repository browser.