source: gli/trunk/src/org/greenstone/gatherer/download/DownloadJob.java@ 17665

Last change on this file since 17665 was 17665, checked in by ak19, 15 years ago

In the case of OAIDownloads, GLI and WgetDownload.pm no longer use sockets for terminating wget. The original GS2 2.80 did a better job of terminating the multiple wgets launched, by just letting the Java code do a process.destroy() on the perl script which--although it would let the current wget download finish--would not launch further wget instances.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.greenstone.LocalGreenstone;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.Utility;
53import org.greenstone.gatherer.cdm.Argument;
54import org.greenstone.gatherer.collection.*;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class DownloadJob
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private DownloadProgressBar progress;
72
73 private int depth;
74 private int previous_state;
75 private int state;
76
77 private String download_url = "";
78
79 // private String current_url;
80 // private String destination;
81 private String proxy_pass;
82 private String proxy_user;
83
84 private Vector encountered_urls;
85 private Vector failed_urls;
86 private Download download;
87 private DownloadScrollPane mummy;
88 private HashMap download_option;
89
90 public static int COMPLETE = 0;
91 public static int PAUSED = 1;
92 public static int RUNNING = 2;
93 public static int STOPPED = 3;
94
95 public static int UNKNOWN_MAX = 0;
96 public static int DEFINED_MAX = 1;
97 public static int UNDEFINED_MAX = 2;
98
99 // To prematurely terminate wget, we will need to use sockets and find a free port.
100 // We will look at a limited range of ports. This range will be reused (circular buffer)
101 private static final int PORT_BASE = 50000;
102 private static final int PORT_BLOCK_SIZE = 100;
103 private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
104 int port; // package access. The socket port number this instance of DownloadJob will use
105
106 private String mode = null;
107
108 private String proxy_url;
109
110 /**
111 */
112 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
113 URL url = null;
114 int folder_hash;
115
116 this.proxy_url = proxy_url;
117
118 download_option = downloadToHashMap(download);
119 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
120 Argument url_arg = (Argument)download_option.get((String)"url");
121 download_url = url_arg.getValue();
122
123 }
124 else {
125 Argument host_arg = (Argument)download_option.get((String)"host");
126 Argument port_arg = (Argument)download_option.get((String)"port");
127 download_url = host_arg.getValue() + ":" +port_arg.getValue();
128 }
129
130 folder_hash = download_url.hashCode();
131 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
132 File log_file = new File(log_filename);
133 if(log_file.exists()) {
134 log_file.delete();
135 }
136
137 File parent_log_file = log_file.getParentFile();
138 parent_log_file.mkdirs();
139 parent_log_file = null;
140 log_file = null;
141
142 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
143
144 this.proxy_pass = proxy_pass;
145 this.proxy_user = proxy_user;
146 this.mummy = mummy;
147 this.mode = mode;
148 this.download = download;
149
150 progress = new DownloadProgressBar(this,download_url, true);
151 encountered_urls = new Vector();
152 failed_urls = new Vector();
153
154 previous_state = STOPPED;
155 state = STOPPED;
156 }
157
158 private HashMap downloadToHashMap(Download download)
159 {
160 HashMap download_option = new HashMap();
161 ArrayList arguments = download.getArguments(true, false);
162 for(int i = 0; i < arguments.size(); i++) {
163 Argument argument = (Argument) arguments.get(i);
164 download_option.put(argument.getName(), argument);
165 }
166 return download_option;
167 }
168
169 /** Depending on which button on the progress bar was pushed,
170 * this method will affect the state of the DownloadJob and perhaps make
171 * calls to wget.class if necessary.
172 * @param event The ActionEvent fired from within the DownloadProgressBar
173 * which we must respond to.
174 */
175 public void actionPerformed(ActionEvent event) {
176 // The stop_start_button is used to alternately start or stop the
177 // job. If the current state of the job is paused then this
178 // restart is logically equivalent to a resume.
179 if(event.getSource() == progress.stop_start_button) {
180 previous_state = state;
181 if (state == RUNNING) {
182 state = STOPPED;
183 } else {
184 //previous_state = state;
185 state = RUNNING;
186 mummy.resumeThread();
187 }
188 }
189 else if (event.getSource() == progress.close_button) {
190 if(state == RUNNING) {
191 previous_state = state;
192 state = STOPPED; // do we need to do anything else to stop this?
193 }
194 mummy.deleteDownloadJob(this);
195 }
196 }
197
198 /** Given a portnumber to check, returns true if it is available
199 * (if nothing's listening there already). */
200 public static boolean isPortAvailable(int portnum) {
201 Socket tmpSocket = null;
202 try {
203 tmpSocket = new Socket("localhost", portnum);
204 tmpSocket.close();
205 return false;
206
207 } catch(ConnectException ex){
208 // "Signals that an error occurred while attempting to connect a socket
209 // to a remote address and port. Typically, the connection was refused
210 // remotely (e.g., no process is listening on the remote address/port)."
211 System.err.println("Port " + portnum + " not yet in use.");
212 tmpSocket = null;
213 return true;
214
215 } catch(Exception ex) {
216 // includes BindException "Signals that an error occurred while attempting
217 // to bind a socket to a local address and port. Typically, the port is in
218 // use, or the requested local address could not be assigned."
219 tmpSocket = null;
220 return false;
221 }
222 }
223
224 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
225 private void incrementNextFreePort() {
226 int offset = nextFreePort - PORT_BASE;
227 offset = (offset + 1) % PORT_BLOCK_SIZE;
228 nextFreePort = PORT_BASE + offset;
229 }
230
231 public void callDownload() {
232
233 ArrayList command_list = new ArrayList();
234 if (Utility.isWindows()) {
235 command_list.add(Configuration.perl_path);
236 command_list.add("-S");
237 }
238 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
239 command_list.add("-download_mode");
240 command_list.add(mode);
241 command_list.add("-cache_dir");
242 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
243 // For the purposes of prematurely terminating wget from GLI (which creates a socket
244 // as a communication channel between GLI and Perl), it is important to tell the script
245 // that we're running as GLI. Because when running from the command prompt, it should
246 // not create this socket and do the related processing.
247 command_list.add("-gli");
248
249 ArrayList all_arg = download.getArguments(true,false);
250 for(int i = 0; i < all_arg.size(); i++) {
251 Argument argument = (Argument) all_arg.get(i);
252 if(argument.isAssigned()) {
253 command_list.add("-" + argument.getName());
254 if(argument.getType() != Argument.FLAG) {
255 command_list.add(argument.getValue());
256 }
257 }
258 }
259
260 String [] cmd = (String []) command_list.toArray(new String[0]);
261 DebugStream.println("Download job, "+command_list);
262
263 if (previous_state == DownloadJob.COMPLETE) {
264 progress.mirrorBegun(true, true);
265 }
266 else {
267 progress.mirrorBegun(false, true);
268 }
269
270 try {
271 Runtime rt = Runtime.getRuntime();
272
273 String [] env = null;
274
275 Process prcs = null;
276
277
278 if (Utility.isWindows()) {
279 prcs = rt.exec(cmd);
280 }
281 else {
282 if (proxy_url != null && !proxy_url.equals("")) {
283 // Specify proxies as environment variables
284 // Need to manually specify GSDLHOME and GSDLOS also
285 env = new String[4];
286 proxy_url = proxy_url.replaceAll("http://","");
287 env[0] = "http_proxy=http://"+proxy_url;
288 env[1] = "ftp_proxy=ftp://"+proxy_url;
289 env[2] = "GSDLHOME=" + Configuration.gsdl_path;
290 env[3] = "GSDLOS=" + Gatherer.client_operating_system;
291 prcs = rt.exec(cmd, env);
292 }
293 else {
294 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
295 prcs = rt.exec(cmd);
296 }
297 }
298 //System.out.println(newcmd);
299
300 // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
301 //(new PerlReaderThread(prcs)).start();
302
303 InputStream is = prcs.getInputStream();
304 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
305
306 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
307 if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI
308
309 // Need to find an available (unused) port within the range we're looking for to pass it
310 // the Perl child process, so that it may set up a listening ServerSocket at that port number
311 try {
312 boolean foundFreePort = false;
313 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
314
315 if(isPortAvailable(nextFreePort)) {
316 foundFreePort = true;
317 break;
318
319 } else {
320 incrementNextFreePort();
321 }
322 }
323
324 if(foundFreePort) {
325 // Free port number currently found becomes the port number of the socket that this
326 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
327 this.port = nextFreePort;
328 incrementNextFreePort();
329
330 } else {
331 throw new Exception("Cannot find an available port in the range "
332 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
333 + "\nwhich is necessary for forcibly terminating wget.");
334 }
335
336 // Communicate the chosen port for this DownloadJob instance to the perl process, so
337 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
338 OutputStream os = prcs.getOutputStream();
339 String p = ""+this.port+"\n";
340 System.err.println("Portnumber found: " + p);
341
342 os.write(p.getBytes());
343 os.close();
344
345 } catch(Exception ex) {
346 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
347 }
348 }
349
350 BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
351 // Capture the standard error stream and search for two particular occurrences.
352 String line="";
353 boolean ignore_for_robots = false;
354 int max_download = DownloadJob.UNKNOWN_MAX;
355
356 while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
357 if ( max_download == DownloadJob.UNKNOWN_MAX) {
358 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
359 max_download = DownloadJob.DEFINED_MAX;
360 }
361 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
362 max_download = DownloadJob.UNDEFINED_MAX;
363 }
364 }
365 else if(max_download == DownloadJob.UNDEFINED_MAX) {
366 DebugStream.println(line);
367 download_log.appendLine(line);
368 // The first magic special test is to see if we've just
369 // asked for the robots.txt file. If so we ignore
370 // the next add and then the next complete/error.
371 if(line.lastIndexOf("robots.txt;") != -1) {
372 DebugStream.println("***** Requesting robot.txt");
373 ignore_for_robots = true;
374 }
375 // If line contains "=> `" display text as the
376 // currently downloading url. Unique to add download.
377 else if(line.lastIndexOf("=> `") != -1) {
378 if(!ignore_for_robots) {
379 // Add download
380 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
381 //addDownload("http:/" + new_url);
382 }
383 }
384 // If line contains "/s) - `" set currently
385 // downloading url to "Download Complete".
386 else if(line.lastIndexOf("/s) - `") != -1) {
387 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
388 if(!ignore_for_robots) {
389 DebugStream.println("Not ignore for robots");
390 // Download complete
391 downloadComplete(current_file_downloading);
392 }
393 else {
394 DebugStream.println("Ignore for robots");
395 ignore_for_robots = false;
396 }
397 }
398 // The already there line begins "File `..." However this
399 // is only true in english, so instead I looked and there
400 // are few (if any at all) other messages than those above
401 // and not overwriting messages that use " `" so we'll
402 // look for that. Note this method is not guarenteed to be
403 // unique like the previous two.
404 else if(line.lastIndexOf(" `") != -1) {
405 // Not Overwriting
406 DebugStream.println("Already there.");
407 String new_url =
408 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
409 //addDownload("http:/" + new_url);
410 downloadWarning();
411 }
412 // Any other important message starts with the time in the form hh:mm:ss
413 else if(line.length() > 7) {
414 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
415 if(!ignore_for_robots) {
416 DebugStream.println("Error.");
417 downloadFailed();
418 }
419 else {
420 ignore_for_robots = false;
421 }
422 }
423 }
424 }
425 else if (max_download == DownloadJob.DEFINED_MAX) {
426 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
427 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
428 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
429 progress.resetFileCount();
430 }
431 else if (line.lastIndexOf("<<Done>>") != -1) {
432 progress.increaseFileCount();
433 }
434 else if(line.lastIndexOf("<<Done:") != -1) {
435 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
436 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
437 }
438
439 DebugStream.println(line);
440 download_log.appendLine(line);
441 }
442 else {
443 System.out.println("Error!!");
444 System.exit(-1);
445 }
446 }
447
448 if(state == STOPPED) {
449 boolean terminatePerlScript = true;
450
451 // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
452 // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
453 // Only wget download modes Web and MediaWiki require the use of sockets to communicate
454 // with the perl script in order to get wget to terminate. Other download modes, including
455 // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
456 // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
457 // the currently running wget will finish off but other wgets are no longer launched.
458 if(prcs != null && (mode.equals("Web") || mode.equals("MediaWiki"))) {
459
460 // create a socket to the perl child process and communicate the STOP message
461 Socket clientSocket = null;
462 if(clientSocket == null) {
463 try {
464 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
465
466 BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
467 String response = clientReader.readLine(); // see if we've been connected
468 System.err.println("Communicating with perl download script on port " + this.port
469 + "\nGot response from perl: " + response);
470
471 // Send the STOP signal
472 OutputStream os = clientSocket.getOutputStream();
473 String message = "<<STOP>>\n";
474 os.write(message.getBytes());
475 response = clientReader.readLine(); // see whether the stop signal has been received
476 System.err.println("GLI sent STOP signal to perl to terminate wget."
477 + "\nGot response from perl: " + response);
478
479 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
480 System.err.println("Got another response from perl: " + response);
481 os.close();
482
483 clientReader.close();
484 clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
485 clientReader = null;
486 clientSocket = null;
487
488 if(response == null) {
489 terminatePerlScript = false;
490 }
491 } catch(IOException ex) {
492 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
493 } catch(Exception ex) {
494 System.err.println("Tried to open client socket, but got exception: " + ex);
495 }
496 }
497 }
498
499 //prcs.getInputStream().close();
500 prcs.getErrorStream().close();
501 br.close();
502 br = null;
503 if(terminatePerlScript) {
504 prcs.destroy(); // This doesn't always work, but it's worth a try
505 prcs = null;
506 }
507
508 // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
509 synchronized(this) {
510 this.notify();
511 }
512 }
513 }
514 catch (Exception ioe) {
515 //message(Utility.ERROR, ioe.toString());
516 //JTest
517 DebugStream.printStackTrace(ioe);
518 }
519 // If we've got to here and the state isn't STOPPED then the
520 // job is complete.
521 if(state == DownloadJob.RUNNING) {
522 progress.mirrorComplete();
523 previous_state = state;
524 state = DownloadJob.COMPLETE;
525 }
526 // refresh the workspace tree
527 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
528 }
529
530
531 /** Called by the WGet native code when the current download is
532 * completed. In turn all download listeners are informed.
533 */
534 public void downloadComplete() {
535 progress.downloadComplete();
536 }
537
538
539 public void downloadComplete(String current_file_downloading)
540 {
541 progress.downloadComplete();
542 DebugStream.println("Download complete: " + current_file_downloading);
543 }
544
545
546 /** Called by the WGet native code when the requested download returns
547 * a status code other than 200.
548 */
549 public void downloadFailed() {
550 // TODO!!
551 //failed_urls.add(current_url); // It is the current url that failed
552 progress.downloadFailed();
553 //DebugStream.println("Download failed: " + current_url);
554 }
555
556 /**
557 */
558 public void downloadWarning() {
559 progress.downloadWarning();
560 }
561
562 public AppendLineOnlyFileDocument getLogDocument() {
563 return download_log;
564 }
565
566 /**
567 * @return Returns the progress bar associated with this job.
568 */
569 public DownloadProgressBar getProgressBar() {
570 return progress;
571 }
572
573 /** Called to discover if the user wanted this thread to run or if
574 * it is paused.
575 * @return An int representing the current DownloadJob state.
576 */
577 public int getState() {
578 return state;
579 }
580
581 /** Returns the current state of the stop flag for this job.
582 * @return A boolean representing whether the user has requested to
583 * stop.
584 */
585 public boolean hasSignalledStop() {
586 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
587 state == DownloadJob.COMPLETE) {
588 return true;
589 }
590 return false;
591 }
592
593 public void setState(int state) {
594 previous_state = this.state;
595 this.state = state;
596 }
597
598 /** A convenience call.
599 * @return A String representing the url of the initial url (root node of the mirrored tree).
600 */
601 public String toString() {
602 return download_url;
603 }
604
605 /** Called by the WGet native code to signal the current progress of
606 * downloading.
607 * @param current A long representing the number of bytes that have
608 * been downloaded since the last update.
609 * @param expected A long representing the total number of bytes
610 * expected for this download.
611 */
612 public void updateProgress(long current, long expected) {
613 progress.updateProgress(current, expected);
614 }
615
616
617 // Inner thread class that reads from process downloadfrom.pl's errorstream
618 private class PerlReaderThread extends Thread {
619 Process prcs = null;
620
621 public PerlReaderThread(Process proc) {
622 this.prcs = proc;
623 }
624
625 public void run() {
626 try {
627 if(prcs != null) {
628 String message = null;
629 BufferedReader eReader = new BufferedReader(new InputStreamReader(prcs.getInputStream()));
630 while(prcs != null && (message = eReader.readLine()) != null) {
631 System.err.println("**** Perl STDOUT: " + message);
632 }
633
634 if(prcs != null && eReader != null) {
635 eReader.close();
636 eReader = null;
637 System.err.println("**** Perl ENDed.");
638 }
639 }
640 } catch(Exception e) {
641 System.err.println("Thread - caught exception: " + e);
642 }
643 }
644 }
645}
Note: See TracBrowser for help on using the repository browser.