source: gli/trunk/src/org/greenstone/gatherer/download/DownloadJob.java@ 17548

Last change on this file since 17548 was 17548, checked in by ak19, 16 years ago

Takes the latest changes to WgetDownload.pm into account: useWget waits for wget to launch before terminating it upon receiving the STOP message from GLI and will return a message once it has finally been able to kill wget. Based on whether this message was received, DownloadJob will decide whether to kill the perl process (otherwise it may have to wait for it to terminate naturally).

  • Property svn:keywords set to Author Date Id Revision
File size: 21.4 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.greenstone.LocalGreenstone;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.Utility;
53import org.greenstone.gatherer.cdm.Argument;
54import org.greenstone.gatherer.collection.*;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class DownloadJob
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private DownloadProgressBar progress;
72
73 private int depth;
74 private int previous_state;
75 private int state;
76
77 private String download_url = "";
78
79 // private String current_url;
80 // private String destination;
81 private String proxy_pass;
82 private String proxy_user;
83
84 private Vector encountered_urls;
85 private Vector failed_urls;
86 private Download download;
87 private DownloadScrollPane mummy;
88 private HashMap download_option;
89
90 public static int COMPLETE = 0;
91 public static int PAUSED = 1;
92 public static int RUNNING = 2;
93 public static int STOPPED = 3;
94
95 public static int UNKNOWN_MAX = 0;
96 public static int DEFINED_MAX = 1;
97 public static int UNDEFINED_MAX = 2;
98
99 // To prematurely terminate wget, we will need to use sockets and find a free port.
100 // We will look at a limited range of ports. This range will be reused (circular buffer)
101 private static final int PORT_BASE = 50000;
102 private static final int PORT_BLOCK_SIZE = 100;
103 private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
104 int port; // package access. The socket port number this instance of DownloadJob will use
105
106 private String mode = null;
107
108 private String proxy_url;
109
110 /**
111 */
112 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
113 URL url = null;
114 int folder_hash;
115
116 this.proxy_url = proxy_url;
117
118 download_option = downloadToHashMap(download);
119 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
120 Argument url_arg = (Argument)download_option.get((String)"url");
121 download_url = url_arg.getValue();
122
123 }
124 else {
125 Argument host_arg = (Argument)download_option.get((String)"host");
126 Argument port_arg = (Argument)download_option.get((String)"port");
127 download_url = host_arg.getValue() + ":" +port_arg.getValue();
128 }
129
130 folder_hash = download_url.hashCode();
131 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
132 File log_file = new File(log_filename);
133 if(log_file.exists()) {
134 log_file.delete();
135 }
136
137 File parent_log_file = log_file.getParentFile();
138 parent_log_file.mkdirs();
139 parent_log_file = null;
140 log_file = null;
141
142 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
143
144 this.proxy_pass = proxy_pass;
145 this.proxy_user = proxy_user;
146 this.mummy = mummy;
147 this.mode = mode;
148 this.download = download;
149
150 progress = new DownloadProgressBar(this,download_url, true);
151 encountered_urls = new Vector();
152 failed_urls = new Vector();
153
154 previous_state = STOPPED;
155 state = STOPPED;
156 }
157
158 private HashMap downloadToHashMap(Download download)
159 {
160 HashMap download_option = new HashMap();
161 ArrayList arguments = download.getArguments(true, false);
162 for(int i = 0; i < arguments.size(); i++) {
163 Argument argument = (Argument) arguments.get(i);
164 download_option.put(argument.getName(), argument);
165 }
166 return download_option;
167 }
168
169 /** Depending on which button on the progress bar was pushed,
170 * this method will affect the state of the DownloadJob and perhaps make
171 * calls to wget.class if necessary.
172 * @param event The ActionEvent fired from within the DownloadProgressBar
173 * which we must respond to.
174 */
175 public void actionPerformed(ActionEvent event) {
176 // The stop_start_button is used to alternately start or stop the
177 // job. If the current state of the job is paused then this
178 // restart is logically equivalent to a resume.
179 if(event.getSource() == progress.stop_start_button) {
180 previous_state = state;
181 if (state == RUNNING) {
182 state = STOPPED;
183 } else {
184 //previous_state = state;
185 state = RUNNING;
186 mummy.resumeThread();
187 }
188 }
189 else if (event.getSource() == progress.close_button) {
190 if(state == RUNNING) {
191 previous_state = state;
192 state = STOPPED; // do we need to do anything else to stop this?
193 }
194 mummy.deleteDownloadJob(this);
195 }
196 }
197
198 /** Given a portnumber to check, returns true if it is available
199 * (if nothing's listening there already). */
200 public static boolean isPortAvailable(int portnum) {
201 Socket tmpSocket = null;
202 try {
203 tmpSocket = new Socket("localhost", portnum);
204 tmpSocket.close();
205 return false;
206
207 } catch(ConnectException ex){
208 // "Signals that an error occurred while attempting to connect a socket
209 // to a remote address and port. Typically, the connection was refused
210 // remotely (e.g., no process is listening on the remote address/port)."
211 System.err.println("Port " + portnum + " not yet in use.");
212 tmpSocket = null;
213 return true;
214
215 } catch(Exception ex) {
216 // includes BindException "Signals that an error occurred while attempting
217 // to bind a socket to a local address and port. Typically, the port is in
218 // use, or the requested local address could not be assigned."
219 tmpSocket = null;
220 return false;
221 }
222 }
223
224 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
225 private void incrementNextFreePort() {
226 int offset = nextFreePort - PORT_BASE;
227 offset = (offset + 1) % PORT_BLOCK_SIZE;
228 nextFreePort = PORT_BASE + offset;
229 }
230
231 public void callDownload() {
232
233 ArrayList command_list = new ArrayList();
234 if (Utility.isWindows()) {
235 command_list.add(Configuration.perl_path);
236 command_list.add("-S");
237 }
238 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
239 command_list.add("-download_mode");
240 command_list.add(mode);
241 command_list.add("-cache_dir");
242 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
243 // For the purposes of prematurely terminating wget from GLI (which creates a socket
244 // as a communication channel between GLI and Perl), it is important to tell the script
245 // that we're running as GLI. Because when running from the command prompt, it should
246 // not create this socket and do the related processing.
247 command_list.add("-gli");
248
249 ArrayList all_arg = download.getArguments(true,false);
250 for(int i = 0; i < all_arg.size(); i++) {
251 Argument argument = (Argument) all_arg.get(i);
252 if(argument.isAssigned()) {
253 command_list.add("-" + argument.getName());
254 if(argument.getType() != Argument.FLAG) {
255 command_list.add(argument.getValue());
256 }
257 }
258 }
259
260 String [] cmd = (String []) command_list.toArray(new String[0]);
261 DebugStream.println("Download job, "+command_list);
262
263 if (previous_state == DownloadJob.COMPLETE) {
264 progress.mirrorBegun(true, true);
265 }
266 else {
267 progress.mirrorBegun(false, true);
268 }
269
270 try {
271 Runtime rt = Runtime.getRuntime();
272
273 String [] env = null;
274
275 Process prcs = null;
276
277
278 if (Utility.isWindows()) {
279 prcs = rt.exec(cmd);
280 }
281 else {
282 if (proxy_url != null && !proxy_url.equals("")) {
283 // Specify proxies as environment variables
284 // Need to manually specify GSDLHOME and GSDLOS also
285 env = new String[4];
286 proxy_url = proxy_url.replaceAll("http://","");
287 env[0] = "http_proxy=http://"+proxy_url;
288 env[1] = "ftp_proxy=ftp://"+proxy_url;
289 env[2] = "GSDLHOME=" + Configuration.gsdl_path;
290 env[3] = "GSDLOS=" + Gatherer.client_operating_system;
291 prcs = rt.exec(cmd, env);
292 }
293 else {
294 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
295 prcs = rt.exec(cmd);
296 }
297 }
298 //System.out.println(newcmd);
299
300 // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
301 //(new PerlReaderThread(prcs)).start();
302
303 InputStream is = prcs.getInputStream();
304 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
305
306 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
307 if (!mode.equals("Z3950") && !mode.equals("SRW")) { // wget-based Download modes (OAI, MediaWiki and Web download)
308
309 // Need to find an available (unused) port within the range we're looking for to pass it
310 // the Perl child process, so that it may set up a listening ServerSocket at that port number
311 try {
312 boolean foundFreePort = false;
313 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
314
315 if(isPortAvailable(nextFreePort)) {
316 foundFreePort = true;
317 break;
318
319 } else {
320 incrementNextFreePort();
321 }
322 }
323
324 if(foundFreePort) {
325 // Free port number currently found becomes the port number of the socket that this
326 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
327 this.port = nextFreePort;
328 incrementNextFreePort();
329
330 } else {
331 throw new Exception("Cannot find an available port in the range "
332 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
333 + "\nwhich is necessary for forcibly terminating wget.");
334 }
335
336 // Communicate the chosen port for this DownloadJob instance to the perl process, so
337 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
338 OutputStream os = prcs.getOutputStream();
339 String p = ""+this.port+"\n";
340 System.err.println("Portnumber found: " + p);
341
342 os.write(p.getBytes());
343 os.close();
344
345 } catch(Exception ex) {
346 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
347 }
348 }
349
350 BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
351 // Capture the standard error stream and search for two particular occurrences.
352 String line="";
353 boolean ignore_for_robots = false;
354 int max_download = DownloadJob.UNKNOWN_MAX;
355
356 while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
357 if ( max_download == DownloadJob.UNKNOWN_MAX) {
358 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
359 max_download = DownloadJob.DEFINED_MAX;
360 }
361 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
362 max_download = DownloadJob.UNDEFINED_MAX;
363 }
364 }
365 else if(max_download == DownloadJob.UNDEFINED_MAX) {
366 DebugStream.println(line);
367 download_log.appendLine(line);
368 // The first magic special test is to see if we've just
369 // asked for the robots.txt file. If so we ignore
370 // the next add and then the next complete/error.
371 if(line.lastIndexOf("robots.txt;") != -1) {
372 DebugStream.println("***** Requesting robot.txt");
373 ignore_for_robots = true;
374 }
375 // If line contains "=> `" display text as the
376 // currently downloading url. Unique to add download.
377 else if(line.lastIndexOf("=> `") != -1) {
378 if(!ignore_for_robots) {
379 // Add download
380 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
381 //addDownload("http:/" + new_url);
382 }
383 }
384 // If line contains "/s) - `" set currently
385 // downloading url to "Download Complete".
386 else if(line.lastIndexOf("/s) - `") != -1) {
387 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
388 if(!ignore_for_robots) {
389 DebugStream.println("Not ignore for robots");
390 // Download complete
391 downloadComplete(current_file_downloading);
392 }
393 else {
394 DebugStream.println("Ignore for robots");
395 ignore_for_robots = false;
396 }
397 }
398 // The already there line begins "File `..." However this
399 // is only true in english, so instead I looked and there
400 // are few (if any at all) other messages than those above
401 // and not overwriting messages that use " `" so we'll
402 // look for that. Note this method is not guarenteed to be
403 // unique like the previous two.
404 else if(line.lastIndexOf(" `") != -1) {
405 // Not Overwriting
406 DebugStream.println("Already there.");
407 String new_url =
408 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
409 //addDownload("http:/" + new_url);
410 downloadWarning();
411 }
412 // Any other important message starts with the time in the form hh:mm:ss
413 else if(line.length() > 7) {
414 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
415 if(!ignore_for_robots) {
416 DebugStream.println("Error.");
417 downloadFailed();
418 }
419 else {
420 ignore_for_robots = false;
421 }
422 }
423 }
424 }
425 else if (max_download == DownloadJob.DEFINED_MAX) {
426 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
427 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
428 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
429 progress.resetFileCount();
430 }
431 else if (line.lastIndexOf("<<Done>>") != -1) {
432 progress.increaseFileCount();
433 }
434 else if(line.lastIndexOf("<<Done:") != -1) {
435 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
436 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
437 }
438
439 DebugStream.println(line);
440 download_log.appendLine(line);
441 }
442 else {
443 System.out.println("Error!!");
444 System.exit(-1);
445 }
446 }
447 if(state == STOPPED) {
448 boolean terminatePerlScript = true;
449
450 // When GLI is working with wget-based download modes (OAI, MediaWiki and Web download)
451 // and the STOP button has been pressed, wget needs to be prematurely terminated.
452 // The presence of the tmpfile will indicate to the perl script that it's time to kill wget.
453 if(prcs != null && !mode.equals("Z3950") && !mode.equals("SRW")) {
454
455 // create a socket to the perl child process and communicate the STOP message
456 Socket clientSocket = null;
457 if(clientSocket == null) {
458 try {
459 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
460
461 BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
462 String response = clientReader.readLine(); // see if we've been connected
463 System.err.println("Communicating with perl download script on port " + this.port
464 + "\nGot response from perl: " + response);
465
466 // Send the STOP signal
467 OutputStream os = clientSocket.getOutputStream();
468 String message = "<<STOP>>\n";
469 os.write(message.getBytes());
470 response = clientReader.readLine(); // see whether the stop signal has been received
471 System.err.println("GLI sent STOP signal to perl to terminate wget."
472 + "\nGot response from perl: " + response);
473
474 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
475 System.err.println("Got another response from perl: " + response);
476 os.close();
477
478 clientReader.close();
479 clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
480 clientReader = null;
481 clientSocket = null;
482
483 if(response == null) {
484 terminatePerlScript = false;
485 }
486 } catch(IOException ex) {
487 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
488 } catch(Exception ex) {
489 System.err.println("Tried to open client socket, but got exception: " + ex);
490 }
491 }
492 }
493
494 //prcs.getInputStream().close();
495 prcs.getErrorStream().close();
496 if(terminatePerlScript) {
497 prcs.destroy(); // This doesn't always work, but it's worth a try
498 prcs = null;
499 }
500 br.close();
501 br = null;
502
503 // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
504 synchronized(this) {
505 this.notify();
506 }
507 }
508 }
509 catch (Exception ioe) {
510 //message(Utility.ERROR, ioe.toString());
511 //JTest
512 DebugStream.printStackTrace(ioe);
513 }
514 // If we've got to here and the state isn't STOPPED then the
515 // job is complete.
516 if(state == DownloadJob.RUNNING) {
517 progress.mirrorComplete();
518 previous_state = state;
519 state = DownloadJob.COMPLETE;
520 }
521 // refresh the workspace tree
522 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
523 }
524
525
526 /** Called by the WGet native code when the current download is
527 * completed. In turn all download listeners are informed.
528 */
529 public void downloadComplete() {
530 progress.downloadComplete();
531 }
532
533
534 public void downloadComplete(String current_file_downloading)
535 {
536 progress.downloadComplete();
537 DebugStream.println("Download complete: " + current_file_downloading);
538 }
539
540
541 /** Called by the WGet native code when the requested download returns
542 * a status code other than 200.
543 */
544 public void downloadFailed() {
545 // TODO!!
546 //failed_urls.add(current_url); // It is the current url that failed
547 progress.downloadFailed();
548 //DebugStream.println("Download failed: " + current_url);
549 }
550
551 /**
552 */
553 public void downloadWarning() {
554 progress.downloadWarning();
555 }
556
557 public AppendLineOnlyFileDocument getLogDocument() {
558 return download_log;
559 }
560
561 /**
562 * @return Returns the progress bar associated with this job.
563 */
564 public DownloadProgressBar getProgressBar() {
565 return progress;
566 }
567
568 /** Called to discover if the user wanted this thread to run or if
569 * it is paused.
570 * @return An int representing the current DownloadJob state.
571 */
572 public int getState() {
573 return state;
574 }
575
576 /** Returns the current state of the stop flag for this job.
577 * @return A boolean representing whether the user has requested to
578 * stop.
579 */
580 public boolean hasSignalledStop() {
581 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
582 state == DownloadJob.COMPLETE) {
583 return true;
584 }
585 return false;
586 }
587
588 public void setState(int state) {
589 previous_state = this.state;
590 this.state = state;
591 }
592
593 /** A convenience call.
594 * @return A String representing the url of the initial url (root node of the mirrored tree).
595 */
596 public String toString() {
597 return download_url;
598 }
599
600 /** Called by the WGet native code to signal the current progress of
601 * downloading.
602 * @param current A long representing the number of bytes that have
603 * been downloaded since the last update.
604 * @param expected A long representing the total number of bytes
605 * expected for this download.
606 */
607 public void updateProgress(long current, long expected) {
608 progress.updateProgress(current, expected);
609 }
610
611
612 // Inner thread class that reads from process downloadfrom.pl's errorstream
613 private class PerlReaderThread extends Thread {
614 Process prcs = null;
615
616 public PerlReaderThread(Process proc) {
617 this.prcs = proc;
618 }
619
620 public void run() {
621 try {
622 if(prcs != null) {
623 String message = null;
624 BufferedReader eReader = new BufferedReader(new InputStreamReader(prcs.getInputStream()));
625 while(prcs != null && (message = eReader.readLine()) != null) {
626 System.err.println("**** Perl STDOUT: " + message);
627 }
628
629 if(prcs != null && eReader != null) {
630 eReader.close();
631 eReader = null;
632 System.err.println("**** Perl ENDed.");
633 }
634 }
635 } catch(Exception e) {
636 System.err.println("Thread - caught exception: " + e);
637 }
638 }
639 }
640}
Note: See TracBrowser for help on using the repository browser.