source: gli/trunk/src/org/greenstone/gatherer/download/DownloadJob.java@ 19313

Last change on this file since 19313 was 19313, checked in by ak19, 15 years ago

To get the Downloading to work when using client-gli, needed to: 1. Pass in GSDLOS from the client-gli.sh script to gli. 2. GLI needs to pass GSDLHOME and GSDLOS as environment variables when executing the downloadfrom.pl as a process on Linux.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.greenstone.LocalGreenstone;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.Utility;
53import org.greenstone.gatherer.cdm.Argument;
54import org.greenstone.gatherer.collection.*;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class DownloadJob
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private DownloadProgressBar progress;
72
73 private int depth;
74 private int previous_state;
75 private int state;
76
77 private String download_url = "";
78
79 // private String current_url;
80 // private String destination;
81 private String proxy_pass;
82 private String proxy_user;
83
84 private Vector encountered_urls;
85 private Vector failed_urls;
86 private Download download;
87 private DownloadScrollPane mummy;
88 private HashMap download_option;
89
90 public static int COMPLETE = 0;
91 public static int PAUSED = 1;
92 public static int RUNNING = 2;
93 public static int STOPPED = 3;
94
95 public static int UNKNOWN_MAX = 0;
96 public static int DEFINED_MAX = 1;
97 public static int UNDEFINED_MAX = 2;
98
99 // To prematurely terminate wget, we will need to use sockets and find a free port.
100 // We will look at a limited range of ports. This range will be reused (circular buffer)
101 private static final int PORT_BASE = 50000;
102 private static final int PORT_BLOCK_SIZE = 100;
103 private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
104 int port; // package access. The socket port number this instance of DownloadJob will use
105
106 private String mode = null;
107
108 private String proxy_url;
109
110 /**
111 */
112 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
113 URL url = null;
114 int folder_hash;
115
116 this.proxy_url = proxy_url;
117
118 download_option = downloadToHashMap(download);
119 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
120 Argument url_arg = (Argument)download_option.get((String)"url");
121 download_url = url_arg.getValue();
122
123 }
124 else {
125 Argument host_arg = (Argument)download_option.get((String)"host");
126 Argument port_arg = (Argument)download_option.get((String)"port");
127 download_url = host_arg.getValue() + ":" +port_arg.getValue();
128 }
129
130 folder_hash = download_url.hashCode();
131 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
132 File log_file = new File(log_filename);
133 if(log_file.exists()) {
134 log_file.delete();
135 }
136
137 File parent_log_file = log_file.getParentFile();
138 parent_log_file.mkdirs();
139 parent_log_file = null;
140 log_file = null;
141
142 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
143
144 this.proxy_pass = proxy_pass;
145 this.proxy_user = proxy_user;
146 this.mummy = mummy;
147 this.mode = mode;
148 this.download = download;
149
150 progress = new DownloadProgressBar(this,download_url, true);
151 encountered_urls = new Vector();
152 failed_urls = new Vector();
153
154 previous_state = STOPPED;
155 state = STOPPED;
156 }
157
158 private HashMap downloadToHashMap(Download download)
159 {
160 HashMap download_option = new HashMap();
161 ArrayList arguments = download.getArguments(true, false);
162 for(int i = 0; i < arguments.size(); i++) {
163 Argument argument = (Argument) arguments.get(i);
164 download_option.put(argument.getName(), argument);
165 }
166 return download_option;
167 }
168
169 /** Depending on which button on the progress bar was pushed,
170 * this method will affect the state of the DownloadJob and perhaps make
171 * calls to wget.class if necessary.
172 * @param event The ActionEvent fired from within the DownloadProgressBar
173 * which we must respond to.
174 */
175 public void actionPerformed(ActionEvent event) {
176 // The stop_start_button is used to alternately start or stop the
177 // job. If the current state of the job is paused then this
178 // restart is logically equivalent to a resume.
179 if(event.getSource() == progress.stop_start_button) {
180 previous_state = state;
181 if (state == RUNNING) {
182 state = STOPPED;
183 } else {
184 //previous_state = state;
185 state = RUNNING;
186 mummy.resumeThread();
187 }
188 }
189 else if (event.getSource() == progress.close_button) {
190 if(state == RUNNING) {
191 previous_state = state;
192 state = STOPPED; // do we need to do anything else to stop this?
193 }
194 mummy.deleteDownloadJob(this);
195 }
196 }
197
198 /** Given a portnumber to check, returns true if it is available
199 * (if nothing's listening there already). */
200 public static boolean isPortAvailable(int portnum) {
201 Socket tmpSocket = null;
202 try {
203 tmpSocket = new Socket("localhost", portnum);
204 tmpSocket.close();
205 return false;
206
207 } catch(ConnectException ex){
208 // "Signals that an error occurred while attempting to connect a socket
209 // to a remote address and port. Typically, the connection was refused
210 // remotely (e.g., no process is listening on the remote address/port)."
211 System.err.println("Port " + portnum + " not yet in use.");
212 tmpSocket = null;
213 return true;
214
215 } catch(Exception ex) {
216 // includes BindException "Signals that an error occurred while attempting
217 // to bind a socket to a local address and port. Typically, the port is in
218 // use, or the requested local address could not be assigned."
219 tmpSocket = null;
220 return false;
221 }
222 }
223
224 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
225 private void incrementNextFreePort() {
226 int offset = nextFreePort - PORT_BASE;
227 offset = (offset + 1) % PORT_BLOCK_SIZE;
228 nextFreePort = PORT_BASE + offset;
229 }
230
231 public void callDownload() {
232
233 ArrayList command_list = new ArrayList();
234 if (Utility.isWindows()) {
235 command_list.add(Configuration.perl_path);
236 command_list.add("-S");
237 }
238
239 // the following also works for client-gli if downloading is enabled (when there's a gs2build directory inside gli)
240 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
241 command_list.add("-download_mode");
242 command_list.add(mode);
243 command_list.add("-cache_dir");
244 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
245 // For the purposes of prematurely terminating wget from GLI (which creates a socket
246 // as a communication channel between GLI and Perl), it is important to tell the script
247 // that we're running as GLI. Because when running from the command prompt, it should
248 // not create this socket and do the related processing.
249 command_list.add("-gli");
250
251 ArrayList all_arg = download.getArguments(true,false);
252 for(int i = 0; i < all_arg.size(); i++) {
253 Argument argument = (Argument) all_arg.get(i);
254 if(argument.isAssigned()) {
255 command_list.add("-" + argument.getName());
256 if(argument.getType() != Argument.FLAG) {
257 command_list.add(argument.getValue());
258 }
259 }
260 }
261
262 String [] cmd = (String []) command_list.toArray(new String[0]);
263 DebugStream.println("Download job, "+command_list);
264
265 if (previous_state == DownloadJob.COMPLETE) {
266 progress.mirrorBegun(true, true);
267 }
268 else {
269 progress.mirrorBegun(false, true);
270 }
271
272 try {
273 Runtime rt = Runtime.getRuntime();
274
275 String [] env = null;
276
277 Process prcs = null;
278
279
280 if (Utility.isWindows()) {
281 prcs = rt.exec(cmd);
282 }
283 else {
284 if (proxy_url != null && !proxy_url.equals("")) {
285 // Specify proxies as environment variables
286 // Need to manually specify GSDLHOME and GSDLOS also
287 env = new String[4];
288 proxy_url = proxy_url.replaceAll("http://","");
289 env[0] = "http_proxy=http://"+proxy_url;
290 env[1] = "ftp_proxy=ftp://"+proxy_url;
291 env[2] = "GSDLHOME=" + Configuration.gsdl_path;
292 env[3] = "GSDLOS=" + Gatherer.client_operating_system;
293 prcs = rt.exec(cmd, env);
294 }
295 else if(Gatherer.isGsdlRemote && Gatherer.isDownloadEnabled) {
296 // Not Windows, but running client with download panel
297 // Need to manually specify GSDLHOME and GSDLOS
298 env = new String[2];
299 env[0] = "GSDLHOME=" + Configuration.gsdl_path;
300 env[1] = "GSDLOS=" + Gatherer.client_operating_system;
301 prcs = rt.exec(cmd, env);
302 }
303 else {
304 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
305 prcs = rt.exec(cmd);
306 }
307 }
308 //System.out.println(newcmd);
309
310 // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
311 //(new PerlReaderThread(prcs)).start();
312
313 InputStream is = prcs.getInputStream();
314 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
315
316 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
317 if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI
318
319 // Need to find an available (unused) port within the range we're looking for to pass it
320 // the Perl child process, so that it may set up a listening ServerSocket at that port number
321 try {
322 boolean foundFreePort = false;
323 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
324
325 if(isPortAvailable(nextFreePort)) {
326 foundFreePort = true;
327 break;
328
329 } else {
330 incrementNextFreePort();
331 }
332 }
333
334 if(foundFreePort) {
335 // Free port number currently found becomes the port number of the socket that this
336 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
337 this.port = nextFreePort;
338 incrementNextFreePort();
339
340 } else {
341 throw new Exception("Cannot find an available port in the range "
342 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
343 + "\nwhich is necessary for forcibly terminating wget.");
344 }
345
346 // Communicate the chosen port for this DownloadJob instance to the perl process, so
347 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
348 OutputStream os = prcs.getOutputStream();
349 String p = ""+this.port+"\n";
350 System.err.println("Portnumber found: " + p);
351
352 os.write(p.getBytes());
353 os.close();
354
355 } catch(Exception ex) {
356 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
357 }
358 }
359
360 BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
361 // Capture the standard error stream and search for two particular occurrences.
362 String line="";
363 boolean ignore_for_robots = false;
364 int max_download = DownloadJob.UNKNOWN_MAX;
365
366 while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
367 if ( max_download == DownloadJob.UNKNOWN_MAX) {
368 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
369 max_download = DownloadJob.DEFINED_MAX;
370 }
371 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
372 max_download = DownloadJob.UNDEFINED_MAX;
373 }
374 }
375 else if(max_download == DownloadJob.UNDEFINED_MAX) {
376 DebugStream.println(line);
377 download_log.appendLine(line);
378 // The first magic special test is to see if we've just
379 // asked for the robots.txt file. If so we ignore
380 // the next add and then the next complete/error.
381 if(line.lastIndexOf("robots.txt;") != -1) {
382 DebugStream.println("***** Requesting robot.txt");
383 ignore_for_robots = true;
384 }
385 // If line contains "=> `" display text as the
386 // currently downloading url. Unique to add download.
387 else if(line.lastIndexOf("=> `") != -1) {
388 if(!ignore_for_robots) {
389 // Add download
390 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
391 //addDownload("http:/" + new_url);
392 }
393 }
394 // If line contains "/s) - `" set currently
395 // downloading url to "Download Complete".
396 else if(line.lastIndexOf("/s) - `") != -1) {
397 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
398 if(!ignore_for_robots) {
399 DebugStream.println("Not ignore for robots");
400 // Download complete
401 downloadComplete(current_file_downloading);
402 }
403 else {
404 DebugStream.println("Ignore for robots");
405 ignore_for_robots = false;
406 }
407 }
408 // The already there line begins "File `..." However this
409 // is only true in english, so instead I looked and there
410 // are few (if any at all) other messages than those above
411 // and not overwriting messages that use " `" so we'll
412 // look for that. Note this method is not guarenteed to be
413 // unique like the previous two.
414 else if(line.lastIndexOf(" `") != -1) {
415 // Not Overwriting
416 DebugStream.println("Already there.");
417 String new_url =
418 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
419 //addDownload("http:/" + new_url);
420 downloadWarning();
421 }
422 // Any other important message starts with the time in the form hh:mm:ss
423 else if(line.length() > 7) {
424 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
425 if(!ignore_for_robots) {
426 DebugStream.println("Error.");
427 downloadFailed();
428 }
429 else {
430 ignore_for_robots = false;
431 }
432 }
433 }
434 }
435 else if (max_download == DownloadJob.DEFINED_MAX) {
436 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
437 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
438 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
439 progress.resetFileCount();
440 }
441 else if (line.lastIndexOf("<<Done>>") != -1) {
442 progress.increaseFileCount();
443 }
444 else if(line.lastIndexOf("<<Done:") != -1) {
445 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
446 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
447 }
448
449 DebugStream.println(line);
450 download_log.appendLine(line);
451 }
452 else {
453 System.out.println("Error!!");
454 System.exit(-1);
455 }
456 }
457
458 if(state == STOPPED) {
459 boolean terminatePerlScript = true;
460
461 // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
462 // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
463 // Only wget download modes Web and MediaWiki require the use of sockets to communicate
464 // with the perl script in order to get wget to terminate. Other download modes, including
465 // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
466 // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
467 // the currently running wget will finish off but other wgets are no longer launched.
468 if(prcs != null && (mode.equals("Web") || mode.equals("MediaWiki"))) {
469
470 // create a socket to the perl child process and communicate the STOP message
471 Socket clientSocket = null;
472 if(clientSocket == null) {
473 try {
474 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
475
476 BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
477 String response = clientReader.readLine(); // see if we've been connected
478 System.err.println("Communicating with perl download script on port " + this.port
479 + "\nGot response from perl: " + response);
480
481 // Send the STOP signal
482 OutputStream os = clientSocket.getOutputStream();
483 String message = "<<STOP>>\n";
484 os.write(message.getBytes());
485 response = clientReader.readLine(); // see whether the stop signal has been received
486 System.err.println("GLI sent STOP signal to perl to terminate wget."
487 + "\nGot response from perl: " + response);
488
489 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
490 System.err.println("Got another response from perl: " + response);
491 os.close();
492
493 clientReader.close();
494 clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
495 clientReader = null;
496 clientSocket = null;
497
498 if(response == null) {
499 terminatePerlScript = false;
500 }
501 } catch(IOException ex) {
502 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
503 } catch(Exception ex) {
504 System.err.println("Tried to open client socket, but got exception: " + ex);
505 }
506 }
507 }
508
509 //prcs.getInputStream().close();
510 prcs.getErrorStream().close();
511 br.close();
512 br = null;
513 if(terminatePerlScript) {
514 prcs.destroy(); // This doesn't always work, but it's worth a try
515 prcs = null;
516 }
517
518 // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
519 synchronized(this) {
520 this.notify();
521 }
522 }
523 }
524 catch (Exception ioe) {
525 //message(Utility.ERROR, ioe.toString());
526 //JTest
527 DebugStream.printStackTrace(ioe);
528 }
529 // If we've got to here and the state isn't STOPPED then the
530 // job is complete.
531 if(state == DownloadJob.RUNNING) {
532 progress.mirrorComplete();
533 previous_state = state;
534 state = DownloadJob.COMPLETE;
535 }
536 // refresh the workspace tree
537 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
538 }
539
540
541 /** Called by the WGet native code when the current download is
542 * completed. In turn all download listeners are informed.
543 */
544 public void downloadComplete() {
545 progress.downloadComplete();
546 }
547
548
549 public void downloadComplete(String current_file_downloading)
550 {
551 progress.downloadComplete();
552 DebugStream.println("Download complete: " + current_file_downloading);
553 }
554
555
556 /** Called by the WGet native code when the requested download returns
557 * a status code other than 200.
558 */
559 public void downloadFailed() {
560 // TODO!!
561 //failed_urls.add(current_url); // It is the current url that failed
562 progress.downloadFailed();
563 //DebugStream.println("Download failed: " + current_url);
564 }
565
566 /**
567 */
568 public void downloadWarning() {
569 progress.downloadWarning();
570 }
571
572 public AppendLineOnlyFileDocument getLogDocument() {
573 return download_log;
574 }
575
576 /**
577 * @return Returns the progress bar associated with this job.
578 */
579 public DownloadProgressBar getProgressBar() {
580 return progress;
581 }
582
583 /** Called to discover if the user wanted this thread to run or if
584 * it is paused.
585 * @return An int representing the current DownloadJob state.
586 */
587 public int getState() {
588 return state;
589 }
590
591 /** Returns the current state of the stop flag for this job.
592 * @return A boolean representing whether the user has requested to
593 * stop.
594 */
595 public boolean hasSignalledStop() {
596 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
597 state == DownloadJob.COMPLETE) {
598 return true;
599 }
600 return false;
601 }
602
603 public void setState(int state) {
604 previous_state = this.state;
605 this.state = state;
606 }
607
608 /** A convenience call.
609 * @return A String representing the url of the initial url (root node of the mirrored tree).
610 */
611 public String toString() {
612 return download_url;
613 }
614
615 /** Called by the WGet native code to signal the current progress of
616 * downloading.
617 * @param current A long representing the number of bytes that have
618 * been downloaded since the last update.
619 * @param expected A long representing the total number of bytes
620 * expected for this download.
621 */
622 public void updateProgress(long current, long expected) {
623 progress.updateProgress(current, expected);
624 }
625
626
627 // Inner thread class that reads from process downloadfrom.pl's errorstream
628 private class PerlReaderThread extends Thread {
629 Process prcs = null;
630
631 public PerlReaderThread(Process proc) {
632 this.prcs = proc;
633 }
634
635 public void run() {
636 try {
637 if(prcs != null) {
638 String message = null;
639 BufferedReader eReader = new BufferedReader(new InputStreamReader(prcs.getInputStream()));
640 while(prcs != null && (message = eReader.readLine()) != null) {
641 if(!message.equals("\n")) {
642 System.err.println("**** Perl STDOUT: " + message);
643 }
644 }
645
646 if(prcs != null && eReader != null) {
647 eReader.close();
648 eReader = null;
649 System.err.println("**** Perl ENDed.");
650 }
651 }
652 } catch(Exception e) {
653 System.err.println("Thread - caught exception: " + e);
654 }
655 }
656 }
657}
Note: See TracBrowser for help on using the repository browser.