source: gli/trunk/src/org/greenstone/gatherer/download/DownloadJob.java@ 20924

Last change on this file since 20924 was 20924, checked in by oranfry, 14 years ago

Since we can now have a Perl installation inside Greenstone on linux as well, GLI code for Linux also launches perl with the -S flag.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.greenstone.LocalGreenstone;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.Utility;
53import org.greenstone.gatherer.cdm.Argument;
54import org.greenstone.gatherer.collection.*;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class DownloadJob
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private DownloadProgressBar progress;
72
73 private int depth;
74 private int previous_state;
75 private int state;
76
77 private String download_url = "";
78
79 // private String current_url;
80 // private String destination;
81 private String proxy_pass;
82 private String proxy_user;
83
84 private Vector encountered_urls;
85 private Vector failed_urls;
86 private Download download;
87 private DownloadScrollPane mummy;
88 private HashMap download_option;
89
90 public static int COMPLETE = 0;
91 public static int PAUSED = 1;
92 public static int RUNNING = 2;
93 public static int STOPPED = 3;
94
95 public static int UNKNOWN_MAX = 0;
96 public static int DEFINED_MAX = 1;
97 public static int UNDEFINED_MAX = 2;
98
99 // To prematurely terminate wget, we will need to use sockets and find a free port.
100 // We will look at a limited range of ports. This range will be reused (circular buffer)
101 private static final int PORT_BASE = 50000;
102 private static final int PORT_BLOCK_SIZE = 100;
103 private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
104 int port; // package access. The socket port number this instance of DownloadJob will use
105
106 private String mode = null;
107
108 private String proxy_url;
109
110 /**
111 */
112 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
113 URL url = null;
114 int folder_hash;
115
116 this.proxy_url = proxy_url;
117
118 download_option = downloadToHashMap(download);
119 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
120 Argument url_arg = (Argument)download_option.get((String)"url");
121 download_url = url_arg.getValue();
122
123 }
124 else {
125 Argument host_arg = (Argument)download_option.get((String)"host");
126 Argument port_arg = (Argument)download_option.get((String)"port");
127 download_url = host_arg.getValue() + ":" +port_arg.getValue();
128 }
129
130 folder_hash = download_url.hashCode();
131 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
132 File log_file = new File(log_filename);
133 if(log_file.exists()) {
134 log_file.delete();
135 }
136
137 File parent_log_file = log_file.getParentFile();
138 parent_log_file.mkdirs();
139 parent_log_file = null;
140 log_file = null;
141
142 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
143
144 this.proxy_pass = proxy_pass;
145 this.proxy_user = proxy_user;
146 this.mummy = mummy;
147 this.mode = mode;
148 this.download = download;
149
150 progress = new DownloadProgressBar(this,download_url, true);
151 encountered_urls = new Vector();
152 failed_urls = new Vector();
153
154 previous_state = STOPPED;
155 state = STOPPED;
156 }
157
158 private HashMap downloadToHashMap(Download download)
159 {
160 HashMap download_option = new HashMap();
161 ArrayList arguments = download.getArguments(true, false);
162 for(int i = 0; i < arguments.size(); i++) {
163 Argument argument = (Argument) arguments.get(i);
164 download_option.put(argument.getName(), argument);
165 }
166 return download_option;
167 }
168
169 /** Depending on which button on the progress bar was pushed,
170 * this method will affect the state of the DownloadJob and perhaps make
171 * calls to wget.class if necessary.
172 * @param event The ActionEvent fired from within the DownloadProgressBar
173 * which we must respond to.
174 */
175 public void actionPerformed(ActionEvent event) {
176 // The stop_start_button is used to alternately start or stop the
177 // job. If the current state of the job is paused then this
178 // restart is logically equivalent to a resume.
179 if(event.getSource() == progress.stop_start_button) {
180 previous_state = state;
181 if (state == RUNNING) {
182 state = STOPPED;
183 } else {
184 //previous_state = state;
185 state = RUNNING;
186 mummy.resumeThread();
187 }
188 }
189 else if (event.getSource() == progress.close_button) {
190 if(state == RUNNING) {
191 previous_state = state;
192 state = STOPPED; // do we need to do anything else to stop this?
193 }
194 mummy.deleteDownloadJob(this);
195 }
196 }
197
198 /** Given a portnumber to check, returns true if it is available
199 * (if nothing's listening there already). */
200 public static boolean isPortAvailable(int portnum) {
201 Socket tmpSocket = null;
202 try {
203 tmpSocket = new Socket("localhost", portnum);
204 tmpSocket.close();
205 return false;
206
207 } catch(ConnectException ex){
208 // "Signals that an error occurred while attempting to connect a socket
209 // to a remote address and port. Typically, the connection was refused
210 // remotely (e.g., no process is listening on the remote address/port)."
211 System.err.println("Port " + portnum + " not yet in use.");
212 tmpSocket = null;
213 return true;
214
215 } catch(Exception ex) {
216 // includes BindException "Signals that an error occurred while attempting
217 // to bind a socket to a local address and port. Typically, the port is in
218 // use, or the requested local address could not be assigned."
219 tmpSocket = null;
220 return false;
221 }
222 }
223
224 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
225 private void incrementNextFreePort() {
226 int offset = nextFreePort - PORT_BASE;
227 offset = (offset + 1) % PORT_BLOCK_SIZE;
228 nextFreePort = PORT_BASE + offset;
229 }
230
231 public void callDownload() {
232
233 ArrayList command_list = new ArrayList();
234
235 // the following also works for client-gli if downloading is enabled (when there's a gs2build directory inside gli)
236 command_list.add(Configuration.perl_path);
237 command_list.add("-S");
238 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
239 command_list.add("-download_mode");
240 command_list.add(mode);
241 command_list.add("-cache_dir");
242 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
243 // For the purposes of prematurely terminating wget from GLI (which creates a socket
244 // as a communication channel between GLI and Perl), it is important to tell the script
245 // that we're running as GLI. Because when running from the command prompt, it should
246 // not create this socket and do the related processing.
247 command_list.add("-gli");
248
249 ArrayList all_arg = download.getArguments(true,false);
250 for(int i = 0; i < all_arg.size(); i++) {
251 Argument argument = (Argument) all_arg.get(i);
252 if(argument.isAssigned()) {
253 command_list.add("-" + argument.getName());
254 if(argument.getType() != Argument.FLAG) {
255 command_list.add(argument.getValue());
256 }
257 }
258 }
259
260 String [] cmd = (String []) command_list.toArray(new String[0]);
261 DebugStream.println("Download job, "+command_list);
262
263 if (previous_state == DownloadJob.COMPLETE) {
264 progress.mirrorBegun(true, true);
265 }
266 else {
267 progress.mirrorBegun(false, true);
268 }
269
270 try {
271 Runtime rt = Runtime.getRuntime();
272
273 String [] env = null;
274
275 Process prcs = null;
276
277
278 if (Utility.isWindows()) {
279 prcs = rt.exec(cmd);
280 }
281 else {
282 if (proxy_url != null && !proxy_url.equals("")) {
283 // Specify proxies as environment variables
284 // Need to manually specify GSDLHOME and GSDLOS also
285 env = new String[4];
286 proxy_url = proxy_url.replaceAll("http://","");
287 env[0] = "http_proxy=http://"+proxy_url;
288 env[1] = "ftp_proxy=ftp://"+proxy_url;
289 env[2] = "GSDLHOME=" + Configuration.gsdl_path;
290 env[3] = "GSDLOS=" + Gatherer.client_operating_system;
291 prcs = rt.exec(cmd, env);
292 }
293 else if(Gatherer.isGsdlRemote && Gatherer.isDownloadEnabled) {
294 // Not Windows, but running client with download panel
295 // Need to manually specify GSDLHOME and GSDLOS
296 env = new String[2];
297 env[0] = "GSDLHOME=" + Configuration.gsdl_path;
298 env[1] = "GSDLOS=" + Gatherer.client_operating_system;
299 prcs = rt.exec(cmd, env);
300 }
301 else {
302 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
303 prcs = rt.exec(cmd);
304 }
305 }
306 //System.out.println(newcmd);
307
308 // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
309 //(new PerlReaderThread(prcs)).start();
310
311 InputStream is = prcs.getInputStream();
312 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
313
314 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
315 if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI
316
317 // Need to find an available (unused) port within the range we're looking for to pass it
318 // the Perl child process, so that it may set up a listening ServerSocket at that port number
319 try {
320 boolean foundFreePort = false;
321 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
322
323 if(isPortAvailable(nextFreePort)) {
324 foundFreePort = true;
325 break;
326
327 } else {
328 incrementNextFreePort();
329 }
330 }
331
332 if(foundFreePort) {
333 // Free port number currently found becomes the port number of the socket that this
334 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
335 this.port = nextFreePort;
336 incrementNextFreePort();
337
338 } else {
339 throw new Exception("Cannot find an available port in the range "
340 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
341 + "\nwhich is necessary for forcibly terminating wget.");
342 }
343
344 // Communicate the chosen port for this DownloadJob instance to the perl process, so
345 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
346 OutputStream os = prcs.getOutputStream();
347 String p = ""+this.port+"\n";
348 System.err.println("Portnumber found: " + p);
349
350 os.write(p.getBytes());
351 os.close();
352
353 } catch(Exception ex) {
354 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
355 }
356 }
357
358 BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
359 // Capture the standard error stream and search for two particular occurrences.
360 String line="";
361 boolean ignore_for_robots = false;
362 int max_download = DownloadJob.UNKNOWN_MAX;
363
364 while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
365 if ( max_download == DownloadJob.UNKNOWN_MAX) {
366 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
367 max_download = DownloadJob.DEFINED_MAX;
368 }
369 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
370 max_download = DownloadJob.UNDEFINED_MAX;
371 }
372 }
373 else if(max_download == DownloadJob.UNDEFINED_MAX) {
374 DebugStream.println(line);
375 download_log.appendLine(line);
376 // The first magic special test is to see if we've just
377 // asked for the robots.txt file. If so we ignore
378 // the next add and then the next complete/error.
379 if(line.lastIndexOf("robots.txt;") != -1) {
380 DebugStream.println("***** Requesting robot.txt");
381 ignore_for_robots = true;
382 }
383 // If line contains "=> `" display text as the
384 // currently downloading url. Unique to add download.
385 else if(line.lastIndexOf("=> `") != -1) {
386 if(!ignore_for_robots) {
387 // Add download
388 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
389 //addDownload("http:/" + new_url);
390 }
391 }
392 // If line contains "/s) - `" set currently
393 // downloading url to "Download Complete".
394 else if(line.lastIndexOf("/s) - `") != -1) {
395 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
396 if(!ignore_for_robots) {
397 DebugStream.println("Not ignore for robots");
398 // Download complete
399 downloadComplete(current_file_downloading);
400 }
401 else {
402 DebugStream.println("Ignore for robots");
403 ignore_for_robots = false;
404 }
405 }
406 // The already there line begins "File `..." However this
407 // is only true in english, so instead I looked and there
408 // are few (if any at all) other messages than those above
409 // and not overwriting messages that use " `" so we'll
410 // look for that. Note this method is not guarenteed to be
411 // unique like the previous two.
412 else if(line.lastIndexOf(" `") != -1) {
413 // Not Overwriting
414 DebugStream.println("Already there.");
415 String new_url =
416 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
417 //addDownload("http:/" + new_url);
418 downloadWarning();
419 }
420 // Any other important message starts with the time in the form hh:mm:ss
421 else if(line.length() > 7) {
422 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
423 if(!ignore_for_robots) {
424 DebugStream.println("Error.");
425 downloadFailed();
426 }
427 else {
428 ignore_for_robots = false;
429 }
430 }
431 }
432 }
433 else if (max_download == DownloadJob.DEFINED_MAX) {
434 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
435 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
436 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
437 progress.resetFileCount();
438 }
439 else if (line.lastIndexOf("<<Done>>") != -1) {
440 progress.increaseFileCount();
441 }
442 else if(line.lastIndexOf("<<Done:") != -1) {
443 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
444 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
445 }
446
447 DebugStream.println(line);
448 download_log.appendLine(line);
449 }
450 else {
451 System.out.println("Error!!");
452 System.exit(-1);
453 }
454 }
455
456 if(state == STOPPED) {
457 boolean terminatePerlScript = true;
458
459 // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
460 // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
461 // Only wget download modes Web and MediaWiki require the use of sockets to communicate
462 // with the perl script in order to get wget to terminate. Other download modes, including
463 // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
464 // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
465 // the currently running wget will finish off but other wgets are no longer launched.
466 if(prcs != null && (mode.equals("Web") || mode.equals("MediaWiki"))) {
467
468 // create a socket to the perl child process and communicate the STOP message
469 Socket clientSocket = null;
470 if(clientSocket == null) {
471 try {
472 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
473
474 BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
475 String response = clientReader.readLine(); // see if we've been connected
476 System.err.println("Communicating with perl download script on port " + this.port
477 + "\nGot response from perl: " + response);
478
479 // Send the STOP signal
480 OutputStream os = clientSocket.getOutputStream();
481 String message = "<<STOP>>\n";
482 os.write(message.getBytes());
483 response = clientReader.readLine(); // see whether the stop signal has been received
484 System.err.println("GLI sent STOP signal to perl to terminate wget."
485 + "\nGot response from perl: " + response);
486
487 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
488 System.err.println("Got another response from perl: " + response);
489 os.close();
490
491 clientReader.close();
492 clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
493 clientReader = null;
494 clientSocket = null;
495
496 if(response == null) {
497 terminatePerlScript = false;
498 }
499 } catch(IOException ex) {
500 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
501 } catch(Exception ex) {
502 System.err.println("Tried to open client socket, but got exception: " + ex);
503 }
504 }
505 }
506
507 //prcs.getInputStream().close();
508 prcs.getErrorStream().close();
509 br.close();
510 br = null;
511 if(terminatePerlScript) {
512 prcs.destroy(); // This doesn't always work, but it's worth a try
513 prcs = null;
514 }
515
516 // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
517 synchronized(this) {
518 this.notify();
519 }
520 }
521 }
522 catch (Exception ioe) {
523 //message(Utility.ERROR, ioe.toString());
524 //JTest
525 DebugStream.printStackTrace(ioe);
526 }
527 // If we've got to here and the state isn't STOPPED then the
528 // job is complete.
529 if(state == DownloadJob.RUNNING) {
530 progress.mirrorComplete();
531 previous_state = state;
532 state = DownloadJob.COMPLETE;
533 }
534 // refresh the workspace tree
535 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
536 }
537
538
539 /** Called by the WGet native code when the current download is
540 * completed. In turn all download listeners are informed.
541 */
542 public void downloadComplete() {
543 progress.downloadComplete();
544 }
545
546
547 public void downloadComplete(String current_file_downloading)
548 {
549 progress.downloadComplete();
550 DebugStream.println("Download complete: " + current_file_downloading);
551 }
552
553
554 /** Called by the WGet native code when the requested download returns
555 * a status code other than 200.
556 */
557 public void downloadFailed() {
558 // TODO!!
559 //failed_urls.add(current_url); // It is the current url that failed
560 progress.downloadFailed();
561 //DebugStream.println("Download failed: " + current_url);
562 }
563
564 /**
565 */
566 public void downloadWarning() {
567 progress.downloadWarning();
568 }
569
570 public AppendLineOnlyFileDocument getLogDocument() {
571 return download_log;
572 }
573
574 /**
575 * @return Returns the progress bar associated with this job.
576 */
577 public DownloadProgressBar getProgressBar() {
578 return progress;
579 }
580
581 /** Called to discover if the user wanted this thread to run or if
582 * it is paused.
583 * @return An int representing the current DownloadJob state.
584 */
585 public int getState() {
586 return state;
587 }
588
589 /** Returns the current state of the stop flag for this job.
590 * @return A boolean representing whether the user has requested to
591 * stop.
592 */
593 public boolean hasSignalledStop() {
594 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
595 state == DownloadJob.COMPLETE) {
596 return true;
597 }
598 return false;
599 }
600
601 public void setState(int state) {
602 previous_state = this.state;
603 this.state = state;
604 }
605
606 /** A convenience call.
607 * @return A String representing the url of the initial url (root node of the mirrored tree).
608 */
609 public String toString() {
610 return download_url;
611 }
612
613 /** Called by the WGet native code to signal the current progress of
614 * downloading.
615 * @param current A long representing the number of bytes that have
616 * been downloaded since the last update.
617 * @param expected A long representing the total number of bytes
618 * expected for this download.
619 */
620 public void updateProgress(long current, long expected) {
621 progress.updateProgress(current, expected);
622 }
623
624
625 // Inner thread class that reads from process downloadfrom.pl's errorstream
626 private class PerlReaderThread extends Thread {
627 Process prcs = null;
628
629 public PerlReaderThread(Process proc) {
630 this.prcs = proc;
631 }
632
633 public void run() {
634 try {
635 if(prcs != null) {
636 String message = null;
637 BufferedReader eReader = new BufferedReader(new InputStreamReader(prcs.getInputStream()));
638 while(prcs != null && (message = eReader.readLine()) != null) {
639 if(!message.equals("\n")) {
640 System.err.println("**** Perl STDOUT: " + message);
641 }
642 }
643
644 if(prcs != null && eReader != null) {
645 eReader.close();
646 eReader = null;
647 System.err.println("**** Perl ENDed.");
648 }
649 }
650 } catch(Exception e) {
651 System.err.println("Thread - caught exception: " + e);
652 }
653 }
654 }
655}
Note: See TracBrowser for help on using the repository browser.