source: main/trunk/gli/src/org/greenstone/gatherer/download/DownloadJob.java@ 22103

Last change on this file since 22103 was 22103, checked in by ak19, 14 years ago

The Download Progress Pane used to say Download Complete almost immediately upon starting a download and while this was still in progress. This was a problem found with OAI download, but is now also fixed with the Web download mode. Tested with both.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.greenstone.LocalGreenstone;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.Utility;
53import org.greenstone.gatherer.cdm.Argument;
54import org.greenstone.gatherer.collection.*;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class DownloadJob
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private DownloadProgressBar progress;
72
73 private int depth;
74 private int previous_state;
75 private int state;
76
77 private String download_url = "";
78
79 // private String current_url;
80 // private String destination;
81 private String proxy_pass;
82 private String proxy_user;
83
84 private Vector encountered_urls;
85 private Vector failed_urls;
86 private Download download;
87 private DownloadScrollPane mummy;
88 private HashMap download_option;
89
90 public static int COMPLETE = 0;
91 public static int PAUSED = 1;
92 public static int RUNNING = 2;
93 public static int STOPPED = 3;
94
95 public static int UNKNOWN_MAX = 0;
96 public static int DEFINED_MAX = 1;
97 public static int UNDEFINED_MAX = 2;
98
99 // To prematurely terminate wget, we will need to use sockets and find a free port.
100 // We will look at a limited range of ports. This range will be reused (circular buffer)
101 private static final int PORT_BASE = 50000;
102 private static final int PORT_BLOCK_SIZE = 100;
103 private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
104 int port; // package access. The socket port number this instance of DownloadJob will use
105
106 private String mode = null;
107
108 private String proxy_url;
109
110 /**
111 */
112 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
113 URL url = null;
114 int folder_hash;
115
116 this.proxy_url = proxy_url;
117
118 download_option = downloadToHashMap(download);
119 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
120 Argument url_arg = (Argument)download_option.get((String)"url");
121 download_url = url_arg.getValue();
122
123 }
124 else {
125 Argument host_arg = (Argument)download_option.get((String)"host");
126 Argument port_arg = (Argument)download_option.get((String)"port");
127 download_url = host_arg.getValue() + ":" +port_arg.getValue();
128 }
129
130 folder_hash = download_url.hashCode();
131 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
132 File log_file = new File(log_filename);
133 if(log_file.exists()) {
134 log_file.delete();
135 }
136
137 File parent_log_file = log_file.getParentFile();
138 parent_log_file.mkdirs();
139 parent_log_file = null;
140 log_file = null;
141
142 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
143
144 this.proxy_pass = proxy_pass;
145 this.proxy_user = proxy_user;
146 this.mummy = mummy;
147 this.mode = mode;
148 this.download = download;
149
150 progress = new DownloadProgressBar(this,download_url, true);
151 encountered_urls = new Vector();
152 failed_urls = new Vector();
153
154 previous_state = STOPPED;
155 state = STOPPED;
156 }
157
158 private HashMap downloadToHashMap(Download download)
159 {
160 HashMap download_option = new HashMap();
161 ArrayList arguments = download.getArguments(true, false);
162 for(int i = 0; i < arguments.size(); i++) {
163 Argument argument = (Argument) arguments.get(i);
164 download_option.put(argument.getName(), argument);
165 }
166 return download_option;
167 }
168
169 /** Depending on which button on the progress bar was pushed,
170 * this method will affect the state of the DownloadJob and perhaps make
171 * calls to wget.class if necessary.
172 * @param event The ActionEvent fired from within the DownloadProgressBar
173 * which we must respond to.
174 */
175 public void actionPerformed(ActionEvent event) {
176 // The stop_start_button is used to alternately start or stop the
177 // job. If the current state of the job is paused then this
178 // restart is logically equivalent to a resume.
179 if(event.getSource() == progress.stop_start_button) {
180 previous_state = state;
181 if (state == RUNNING) {
182 state = STOPPED;
183 } else {
184 //previous_state = state;
185 state = RUNNING;
186 mummy.resumeThread();
187 }
188 }
189 else if (event.getSource() == progress.close_button) {
190 if(state == RUNNING) {
191 previous_state = state;
192 state = STOPPED; // do we need to do anything else to stop this?
193 }
194 mummy.deleteDownloadJob(this);
195 }
196 }
197
198 /** Given a portnumber to check, returns true if it is available
199 * (if nothing's listening there already). */
200 public static boolean isPortAvailable(int portnum) {
201 Socket tmpSocket = null;
202 try {
203 tmpSocket = new Socket("localhost", portnum);
204 tmpSocket.close();
205 return false;
206
207 } catch(ConnectException ex){
208 // "Signals that an error occurred while attempting to connect a socket
209 // to a remote address and port. Typically, the connection was refused
210 // remotely (e.g., no process is listening on the remote address/port)."
211 System.err.println("Port " + portnum + " not yet in use.");
212 tmpSocket = null;
213 return true;
214
215 } catch(Exception ex) {
216 // includes BindException "Signals that an error occurred while attempting
217 // to bind a socket to a local address and port. Typically, the port is in
218 // use, or the requested local address could not be assigned."
219 tmpSocket = null;
220 return false;
221 }
222 }
223
224 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
225 private void incrementNextFreePort() {
226 int offset = nextFreePort - PORT_BASE;
227 offset = (offset + 1) % PORT_BLOCK_SIZE;
228 nextFreePort = PORT_BASE + offset;
229 }
230
231 public void callDownload() {
232
233 ArrayList command_list = new ArrayList();
234
235 // the following also works for client-gli if downloading is enabled (when there's a gs2build directory inside gli)
236 command_list.add(Configuration.perl_path);
237 command_list.add("-S");
238 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
239 command_list.add("-download_mode");
240 command_list.add(mode);
241 command_list.add("-cache_dir");
242 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
243 // For the purposes of prematurely terminating wget from GLI (which creates a socket
244 // as a communication channel between GLI and Perl), it is important to tell the script
245 // that we're running as GLI. Because when running from the command prompt, it should
246 // not create this socket and do the related processing.
247 command_list.add("-gli");
248
249 ArrayList all_arg = download.getArguments(true,false);
250 for(int i = 0; i < all_arg.size(); i++) {
251 Argument argument = (Argument) all_arg.get(i);
252 if(argument.isAssigned()) {
253 command_list.add("-" + argument.getName());
254 if(argument.getType() != Argument.FLAG) {
255 command_list.add(argument.getValue());
256 }
257 }
258 }
259
260 String [] cmd = (String []) command_list.toArray(new String[0]);
261 DebugStream.println("Download job, "+command_list);
262
263 if (previous_state == DownloadJob.COMPLETE) {
264 progress.mirrorBegun(true, true);
265 }
266 else {
267 progress.mirrorBegun(false, true);
268 }
269
270 try {
271 Runtime rt = Runtime.getRuntime();
272
273 String [] env = null;
274
275 Process prcs = null;
276
277
278 if (Utility.isWindows()) {
279 prcs = rt.exec(cmd);
280 }
281 else {
282 if (proxy_url != null && !proxy_url.equals("")) {
283 // Specify proxies as environment variables
284 // Need to manually specify GSDLHOME and GSDLOS also
285 env = new String[4];
286 proxy_url = proxy_url.replaceAll("http://","");
287 env[0] = "http_proxy=http://"+proxy_url;
288 env[1] = "ftp_proxy=ftp://"+proxy_url;
289 env[2] = "GSDLHOME=" + Configuration.gsdl_path;
290 env[3] = "GSDLOS=" + Gatherer.client_operating_system;
291 prcs = rt.exec(cmd, env);
292 }
293 else if(Gatherer.isGsdlRemote && Gatherer.isDownloadEnabled) {
294 // Not Windows, but running client with download panel
295 // Need to manually specify GSDLHOME and GSDLOS
296 env = new String[2];
297 env[0] = "GSDLHOME=" + Configuration.gsdl_path;
298 env[1] = "GSDLOS=" + Gatherer.client_operating_system;
299 prcs = rt.exec(cmd, env);
300 }
301 else {
302 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
303 prcs = rt.exec(cmd);
304 }
305 }
306 //System.out.println(newcmd);
307
308 // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
309 //(new PerlReaderThread(prcs)).start();
310
311 InputStream is = prcs.getInputStream();
312 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
313
314 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
315 if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI
316
317 // Need to find an available (unused) port within the range we're looking for to pass it
318 // the Perl child process, so that it may set up a listening ServerSocket at that port number
319 try {
320 boolean foundFreePort = false;
321 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
322
323 if(isPortAvailable(nextFreePort)) {
324 foundFreePort = true;
325 break;
326
327 } else {
328 incrementNextFreePort();
329 }
330 }
331
332 if(foundFreePort) {
333 // Free port number currently found becomes the port number of the socket that this
334 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
335 this.port = nextFreePort;
336 incrementNextFreePort();
337
338 } else {
339 throw new Exception("Cannot find an available port in the range "
340 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
341 + "\nwhich is necessary for forcibly terminating wget.");
342 }
343
344 // Communicate the chosen port for this DownloadJob instance to the perl process, so
345 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
346 OutputStream os = prcs.getOutputStream();
347 String p = ""+this.port+"\n";
348 System.err.println("Portnumber found: " + p);
349
350 os.write(p.getBytes());
351 os.close();
352
353 } catch(Exception ex) {
354 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
355 }
356 }
357
358 BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
359 // Capture the standard error stream and search for two particular occurrences.
360 String line="";
361 boolean ignore_for_robots = false;
362 int max_download = DownloadJob.UNKNOWN_MAX;
363
364 while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
365 if ( max_download == DownloadJob.UNKNOWN_MAX) {
366 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
367 max_download = DownloadJob.DEFINED_MAX;
368 }
369 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
370 max_download = DownloadJob.UNDEFINED_MAX;
371 }
372 }
373 else if(max_download == DownloadJob.UNDEFINED_MAX) {
374 DebugStream.println(line);
375 download_log.appendLine(line);
376 // The first magic special test is to see if we've just
377 // asked for the robots.txt file. If so we ignore
378 // the next add and then the next complete/error.
379 if(line.lastIndexOf("robots.txt;") != -1) {
380 DebugStream.println("***** Requesting robot.txt");
381 ignore_for_robots = true;
382 }
383 // If line contains "=> `" display text as the
384 // currently downloading url. Unique to add download.
385 else if(line.lastIndexOf("=> `") != -1) {
386 if(!ignore_for_robots) {
387 // Add download
388 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
389 progress.addDownload("file"); //addDownload("http:/" + new_url.substring(cachedir_prefix_length()-1));
390 }
391 }
392 // If line contains "/s) - `" set currently
393 // downloading url to "Download Complete".
394 else if(line.lastIndexOf("/s) - `") != -1) {
395 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
396 if(!ignore_for_robots) {
397 DebugStream.println("Not ignore for robots");
398 // Download complete
399 downloadComplete(current_file_downloading);
400 }
401 else {
402 DebugStream.println("Ignore for robots");
403 ignore_for_robots = false;
404 }
405 }
406 // The already there line begins "File `..." However this
407 // is only true in english, so instead I looked and there
408 // are few (if any at all) other messages than those above
409 // and not overwriting messages that use " `" so we'll
410 // look for that. Note this method is not guarenteed to be
411 // unique like the previous two.
412 else if(line.lastIndexOf(" `") != -1) {
413 // Not Overwriting
414 DebugStream.println("Already there.");
415 String new_url =
416 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
417 progress.addDownload("file"); //addDownload("http:/" + new_url.substring(cachedir_prefix_length()-1));
418 downloadWarning();
419 }
420 // Any other important message starts with the time in the form hh:mm:ss
421 else if(line.length() > 7) {
422 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
423 if(!ignore_for_robots) {
424 DebugStream.println("Error.");
425 downloadFailed();
426 }
427 else {
428 ignore_for_robots = false;
429 }
430 }
431 }
432 }
433 else if (max_download == DownloadJob.DEFINED_MAX) {
434 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
435 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
436 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
437 progress.resetFileCount();
438 progress.addDownload("files"); // for display: "Downloading files"
439 }
440 else if (line.lastIndexOf("<<Done>>") != -1) {
441 progress.increaseFileCount();
442 }
443 else if(line.lastIndexOf("<<Done:") != -1) {
444 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
445 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
446 }
447
448 DebugStream.println(line);
449 download_log.appendLine(line);
450 }
451 else {
452 System.out.println("Error!!");
453 System.exit(-1);
454 }
455 }
456
457 if(state == STOPPED) {
458 boolean terminatePerlScript = true;
459
460 // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
461 // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
462 // Only wget download modes Web and MediaWiki require the use of sockets to communicate
463 // with the perl script in order to get wget to terminate. Other download modes, including
464 // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
465 // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
466 // the currently running wget will finish off but other wgets are no longer launched.
467 if(prcs != null && (mode.equals("Web") || mode.equals("MediaWiki"))) {
468
469 // create a socket to the perl child process and communicate the STOP message
470 Socket clientSocket = null;
471 if(clientSocket == null) {
472 try {
473 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
474
475 BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
476 String response = clientReader.readLine(); // see if we've been connected
477 System.err.println("Communicating with perl download script on port " + this.port
478 + "\nGot response from perl: " + response);
479
480 // Send the STOP signal
481 OutputStream os = clientSocket.getOutputStream();
482 String message = "<<STOP>>\n";
483 os.write(message.getBytes());
484 response = clientReader.readLine(); // see whether the stop signal has been received
485 System.err.println("GLI sent STOP signal to perl to terminate wget."
486 + "\nGot response from perl: " + response);
487
488 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
489 System.err.println("Got another response from perl: " + response);
490 os.close();
491
492 clientReader.close();
493 clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
494 clientReader = null;
495 clientSocket = null;
496
497 if(response == null) {
498 terminatePerlScript = false;
499 }
500 } catch(IOException ex) {
501 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
502 } catch(Exception ex) {
503 System.err.println("Tried to open client socket, but got exception: " + ex);
504 }
505 }
506 }
507
508 //prcs.getInputStream().close();
509 prcs.getErrorStream().close();
510 br.close();
511 br = null;
512 if(terminatePerlScript) {
513 prcs.destroy(); // This doesn't always work, but it's worth a try
514 prcs = null;
515 }
516
517 // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
518 synchronized(this) {
519 this.notify();
520 }
521 }
522 }
523 catch (Exception ioe) {
524 //message(Utility.ERROR, ioe.toString());
525 //JTest
526 DebugStream.printStackTrace(ioe);
527 }
528 // If we've got to here and the state isn't STOPPED then the
529 // job is complete.
530 if(state == DownloadJob.RUNNING) {
531 progress.mirrorComplete();
532 previous_state = state;
533 state = DownloadJob.COMPLETE;
534 }
535 // refresh the workspace tree
536 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
537 }
538
539
540 /** Called by the WGet native code when the current download is
541 * completed. In turn all download listeners are informed.
542 */
543 public void downloadComplete() {
544 progress.downloadComplete();
545 }
546
547
548 public void downloadComplete(String current_file_downloading)
549 {
550 progress.downloadComplete();
551 DebugStream.println("Download complete: " + current_file_downloading);
552 }
553
554
555 /** Called by the WGet native code when the requested download returns
556 * a status code other than 200.
557 */
558 public void downloadFailed() {
559 // TODO!!
560 //failed_urls.add(current_url); // It is the current url that failed
561 progress.downloadFailed();
562 //DebugStream.println("Download failed: " + current_url);
563 }
564
565 /**
566 */
567 public void downloadWarning() {
568 progress.downloadWarning();
569 }
570
571 public AppendLineOnlyFileDocument getLogDocument() {
572 return download_log;
573 }
574
575 /**
576 * @return Returns the progress bar associated with this job.
577 */
578 public DownloadProgressBar getProgressBar() {
579 return progress;
580 }
581
582 /** Called to discover if the user wanted this thread to run or if
583 * it is paused.
584 * @return An int representing the current DownloadJob state.
585 */
586 public int getState() {
587 return state;
588 }
589
590 /** Returns the current state of the stop flag for this job.
591 * @return A boolean representing whether the user has requested to
592 * stop.
593 */
594 public boolean hasSignalledStop() {
595 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
596 state == DownloadJob.COMPLETE) {
597 return true;
598 }
599 return false;
600 }
601
602 public void setState(int state) {
603 previous_state = this.state;
604 this.state = state;
605 }
606
607 /** A convenience call.
608 * @return A String representing the url of the initial url (root node of the mirrored tree).
609 */
610 public String toString() {
611 return download_url;
612 }
613
614 /** Called by the WGet native code to signal the current progress of
615 * downloading.
616 * @param current A long representing the number of bytes that have
617 * been downloaded since the last update.
618 * @param expected A long representing the total number of bytes
619 * expected for this download.
620 */
621 public void updateProgress(long current, long expected) {
622 progress.updateProgress(current, expected);
623 }
624
625
626 // Inner thread class that reads from process downloadfrom.pl's errorstream
627 private class PerlReaderThread extends Thread {
628 Process prcs = null;
629
630 public PerlReaderThread(Process proc) {
631 this.prcs = proc;
632 }
633
634 public void run() {
635 try {
636 if(prcs != null) {
637 String message = null;
638 BufferedReader eReader = new BufferedReader(new InputStreamReader(prcs.getInputStream()));
639 while(prcs != null && (message = eReader.readLine()) != null) {
640 if(!message.equals("\n")) {
641 System.err.println("**** Perl STDOUT: " + message);
642 }
643 }
644
645 if(prcs != null && eReader != null) {
646 eReader.close();
647 eReader = null;
648 System.err.println("**** Perl ENDed.");
649 }
650 }
651 } catch(Exception e) {
652 System.err.println("Thread - caught exception: " + e);
653 }
654 }
655 }
656}
Note: See TracBrowser for help on using the repository browser.