source: gli/branches/rtl-gli/src/org/greenstone/gatherer/download/DownloadJob.java@ 18359

Last change on this file since 18359 was 18359, checked in by kjdon, 15 years ago

updated the rtl-gli branch with files from trunk. Result of a merge 14807:18318

  • Property svn:keywords set to Author Date Id Revision
File size: 21.8 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Configuration;
45import org.greenstone.gatherer.DebugStream;
46import org.greenstone.gatherer.Dictionary;
47import org.greenstone.gatherer.Gatherer;
48import org.greenstone.gatherer.GAuthenticator;
49import org.greenstone.gatherer.greenstone.LocalGreenstone;
50import org.greenstone.gatherer.file.WorkspaceTree;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.Utility;
53import org.greenstone.gatherer.cdm.Argument;
54import org.greenstone.gatherer.collection.*;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class DownloadJob
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private DownloadProgressBar progress;
72
73 private int depth;
74 private int previous_state;
75 private int state;
76
77 private String download_url = "";
78
79 // private String current_url;
80 // private String destination;
81 private String proxy_pass;
82 private String proxy_user;
83
84 private Vector encountered_urls;
85 private Vector failed_urls;
86 private Download download;
87 private DownloadScrollPane mummy;
88 private HashMap download_option;
89
90 public static int COMPLETE = 0;
91 public static int PAUSED = 1;
92 public static int RUNNING = 2;
93 public static int STOPPED = 3;
94
95 public static int UNKNOWN_MAX = 0;
96 public static int DEFINED_MAX = 1;
97 public static int UNDEFINED_MAX = 2;
98
99 // To prematurely terminate wget, we will need to use sockets and find a free port.
100 // We will look at a limited range of ports. This range will be reused (circular buffer)
101 private static final int PORT_BASE = 50000;
102 private static final int PORT_BLOCK_SIZE = 100;
103 private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
104 int port; // package access. The socket port number this instance of DownloadJob will use
105
106 private String mode = null;
107
108 private String proxy_url;
109
110 /**
111 */
112 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
113 URL url = null;
114 int folder_hash;
115
116 this.proxy_url = proxy_url;
117
118 download_option = downloadToHashMap(download);
119 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
120 Argument url_arg = (Argument)download_option.get((String)"url");
121 download_url = url_arg.getValue();
122
123 }
124 else {
125 Argument host_arg = (Argument)download_option.get((String)"host");
126 Argument port_arg = (Argument)download_option.get((String)"port");
127 download_url = host_arg.getValue() + ":" +port_arg.getValue();
128 }
129
130 folder_hash = download_url.hashCode();
131 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
132 File log_file = new File(log_filename);
133 if(log_file.exists()) {
134 log_file.delete();
135 }
136
137 File parent_log_file = log_file.getParentFile();
138 parent_log_file.mkdirs();
139 parent_log_file = null;
140 log_file = null;
141
142 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
143
144 this.proxy_pass = proxy_pass;
145 this.proxy_user = proxy_user;
146 this.mummy = mummy;
147 this.mode = mode;
148 this.download = download;
149
150 progress = new DownloadProgressBar(this,download_url, true);
151 encountered_urls = new Vector();
152 failed_urls = new Vector();
153
154 previous_state = STOPPED;
155 state = STOPPED;
156 }
157
158 private HashMap downloadToHashMap(Download download)
159 {
160 HashMap download_option = new HashMap();
161 ArrayList arguments = download.getArguments(true, false);
162 for(int i = 0; i < arguments.size(); i++) {
163 Argument argument = (Argument) arguments.get(i);
164 download_option.put(argument.getName(), argument);
165 }
166 return download_option;
167 }
168
169 /** Depending on which button on the progress bar was pushed,
170 * this method will affect the state of the DownloadJob and perhaps make
171 * calls to wget.class if necessary.
172 * @param event The ActionEvent fired from within the DownloadProgressBar
173 * which we must respond to.
174 */
175 public void actionPerformed(ActionEvent event) {
176 // The stop_start_button is used to alternately start or stop the
177 // job. If the current state of the job is paused then this
178 // restart is logically equivalent to a resume.
179 if(event.getSource() == progress.stop_start_button) {
180 previous_state = state;
181 if (state == RUNNING) {
182 state = STOPPED;
183 } else {
184 //previous_state = state;
185 state = RUNNING;
186 mummy.resumeThread();
187 }
188 }
189 else if (event.getSource() == progress.close_button) {
190 if(state == RUNNING) {
191 previous_state = state;
192 state = STOPPED; // do we need to do anything else to stop this?
193 }
194 mummy.deleteDownloadJob(this);
195 }
196 }
197
198 /** Given a portnumber to check, returns true if it is available
199 * (if nothing's listening there already). */
200 public static boolean isPortAvailable(int portnum) {
201 Socket tmpSocket = null;
202 try {
203 tmpSocket = new Socket("localhost", portnum);
204 tmpSocket.close();
205 return false;
206
207 } catch(ConnectException ex){
208 // "Signals that an error occurred while attempting to connect a socket
209 // to a remote address and port. Typically, the connection was refused
210 // remotely (e.g., no process is listening on the remote address/port)."
211 System.err.println("Port " + portnum + " not yet in use.");
212 tmpSocket = null;
213 return true;
214
215 } catch(Exception ex) {
216 // includes BindException "Signals that an error occurred while attempting
217 // to bind a socket to a local address and port. Typically, the port is in
218 // use, or the requested local address could not be assigned."
219 tmpSocket = null;
220 return false;
221 }
222 }
223
224 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
225 private void incrementNextFreePort() {
226 int offset = nextFreePort - PORT_BASE;
227 offset = (offset + 1) % PORT_BLOCK_SIZE;
228 nextFreePort = PORT_BASE + offset;
229 }
230
231 public void callDownload() {
232
233 ArrayList command_list = new ArrayList();
234 if (Utility.isWindows()) {
235 command_list.add(Configuration.perl_path);
236 command_list.add("-S");
237 }
238 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
239 command_list.add("-download_mode");
240 command_list.add(mode);
241 command_list.add("-cache_dir");
242 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
243 // For the purposes of prematurely terminating wget from GLI (which creates a socket
244 // as a communication channel between GLI and Perl), it is important to tell the script
245 // that we're running as GLI. Because when running from the command prompt, it should
246 // not create this socket and do the related processing.
247 command_list.add("-gli");
248
249 ArrayList all_arg = download.getArguments(true,false);
250 for(int i = 0; i < all_arg.size(); i++) {
251 Argument argument = (Argument) all_arg.get(i);
252 if(argument.isAssigned()) {
253 command_list.add("-" + argument.getName());
254 if(argument.getType() != Argument.FLAG) {
255 command_list.add(argument.getValue());
256 }
257 }
258 }
259
260 String [] cmd = (String []) command_list.toArray(new String[0]);
261 DebugStream.println("Download job, "+command_list);
262
263 if (previous_state == DownloadJob.COMPLETE) {
264 progress.mirrorBegun(true, true);
265 }
266 else {
267 progress.mirrorBegun(false, true);
268 }
269
270 try {
271 Runtime rt = Runtime.getRuntime();
272
273 String [] env = null;
274
275 Process prcs = null;
276
277
278 if (Utility.isWindows()) {
279 prcs = rt.exec(cmd);
280 }
281 else {
282 if (proxy_url != null && !proxy_url.equals("")) {
283 // Specify proxies as environment variables
284 // Need to manually specify GSDLHOME and GSDLOS also
285 env = new String[4];
286 proxy_url = proxy_url.replaceAll("http://","");
287 env[0] = "http_proxy=http://"+proxy_url;
288 env[1] = "ftp_proxy=ftp://"+proxy_url;
289 env[2] = "GSDLHOME=" + Configuration.gsdl_path;
290 env[3] = "GSDLOS=" + Gatherer.client_operating_system;
291 prcs = rt.exec(cmd, env);
292 }
293 else {
294 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
295 prcs = rt.exec(cmd);
296 }
297 }
298 //System.out.println(newcmd);
299
300 // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
301 //(new PerlReaderThread(prcs)).start();
302
303 InputStream is = prcs.getInputStream();
304 BufferedReader reader = new BufferedReader(new InputStreamReader(is));
305
306 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
307 if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI
308
309 // Need to find an available (unused) port within the range we're looking for to pass it
310 // the Perl child process, so that it may set up a listening ServerSocket at that port number
311 try {
312 boolean foundFreePort = false;
313 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
314
315 if(isPortAvailable(nextFreePort)) {
316 foundFreePort = true;
317 break;
318
319 } else {
320 incrementNextFreePort();
321 }
322 }
323
324 if(foundFreePort) {
325 // Free port number currently found becomes the port number of the socket that this
326 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
327 this.port = nextFreePort;
328 incrementNextFreePort();
329
330 } else {
331 throw new Exception("Cannot find an available port in the range "
332 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
333 + "\nwhich is necessary for forcibly terminating wget.");
334 }
335
336 // Communicate the chosen port for this DownloadJob instance to the perl process, so
337 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
338 OutputStream os = prcs.getOutputStream();
339 String p = ""+this.port+"\n";
340 System.err.println("Portnumber found: " + p);
341
342 os.write(p.getBytes());
343 os.close();
344
345 } catch(Exception ex) {
346 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
347 }
348 }
349
350 BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
351 // Capture the standard error stream and search for two particular occurrences.
352 String line="";
353 boolean ignore_for_robots = false;
354 int max_download = DownloadJob.UNKNOWN_MAX;
355
356 while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
357 if ( max_download == DownloadJob.UNKNOWN_MAX) {
358 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
359 max_download = DownloadJob.DEFINED_MAX;
360 }
361 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
362 max_download = DownloadJob.UNDEFINED_MAX;
363 }
364 }
365 else if(max_download == DownloadJob.UNDEFINED_MAX) {
366 DebugStream.println(line);
367 download_log.appendLine(line);
368 // The first magic special test is to see if we've just
369 // asked for the robots.txt file. If so we ignore
370 // the next add and then the next complete/error.
371 if(line.lastIndexOf("robots.txt;") != -1) {
372 DebugStream.println("***** Requesting robot.txt");
373 ignore_for_robots = true;
374 }
375 // If line contains "=> `" display text as the
376 // currently downloading url. Unique to add download.
377 else if(line.lastIndexOf("=> `") != -1) {
378 if(!ignore_for_robots) {
379 // Add download
380 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
381 //addDownload("http:/" + new_url);
382 }
383 }
384 // If line contains "/s) - `" set currently
385 // downloading url to "Download Complete".
386 else if(line.lastIndexOf("/s) - `") != -1) {
387 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
388 if(!ignore_for_robots) {
389 DebugStream.println("Not ignore for robots");
390 // Download complete
391 downloadComplete(current_file_downloading);
392 }
393 else {
394 DebugStream.println("Ignore for robots");
395 ignore_for_robots = false;
396 }
397 }
398 // The already there line begins "File `..." However this
399 // is only true in english, so instead I looked and there
400 // are few (if any at all) other messages than those above
401 // and not overwriting messages that use " `" so we'll
402 // look for that. Note this method is not guarenteed to be
403 // unique like the previous two.
404 else if(line.lastIndexOf(" `") != -1) {
405 // Not Overwriting
406 DebugStream.println("Already there.");
407 String new_url =
408 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
409 //addDownload("http:/" + new_url);
410 downloadWarning();
411 }
412 // Any other important message starts with the time in the form hh:mm:ss
413 else if(line.length() > 7) {
414 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
415 if(!ignore_for_robots) {
416 DebugStream.println("Error.");
417 downloadFailed();
418 }
419 else {
420 ignore_for_robots = false;
421 }
422 }
423 }
424 }
425 else if (max_download == DownloadJob.DEFINED_MAX) {
426 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
427 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
428 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
429 progress.resetFileCount();
430 }
431 else if (line.lastIndexOf("<<Done>>") != -1) {
432 progress.increaseFileCount();
433 }
434 else if(line.lastIndexOf("<<Done:") != -1) {
435 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
436 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
437 }
438
439 DebugStream.println(line);
440 download_log.appendLine(line);
441 }
442 else {
443 System.out.println("Error!!");
444 System.exit(-1);
445 }
446 }
447
448 if(state == STOPPED) {
449 boolean terminatePerlScript = true;
450
451 // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
452 // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
453 // Only wget download modes Web and MediaWiki require the use of sockets to communicate
454 // with the perl script in order to get wget to terminate. Other download modes, including
455 // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
456 // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
457 // the currently running wget will finish off but other wgets are no longer launched.
458 if(prcs != null && (mode.equals("Web") || mode.equals("MediaWiki"))) {
459
460 // create a socket to the perl child process and communicate the STOP message
461 Socket clientSocket = null;
462 if(clientSocket == null) {
463 try {
464 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
465
466 BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
467 String response = clientReader.readLine(); // see if we've been connected
468 System.err.println("Communicating with perl download script on port " + this.port
469 + "\nGot response from perl: " + response);
470
471 // Send the STOP signal
472 OutputStream os = clientSocket.getOutputStream();
473 String message = "<<STOP>>\n";
474 os.write(message.getBytes());
475 response = clientReader.readLine(); // see whether the stop signal has been received
476 System.err.println("GLI sent STOP signal to perl to terminate wget."
477 + "\nGot response from perl: " + response);
478
479 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
480 System.err.println("Got another response from perl: " + response);
481 os.close();
482
483 clientReader.close();
484 clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
485 clientReader = null;
486 clientSocket = null;
487
488 if(response == null) {
489 terminatePerlScript = false;
490 }
491 } catch(IOException ex) {
492 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
493 } catch(Exception ex) {
494 System.err.println("Tried to open client socket, but got exception: " + ex);
495 }
496 }
497 }
498
499 //prcs.getInputStream().close();
500 prcs.getErrorStream().close();
501 br.close();
502 br = null;
503 if(terminatePerlScript) {
504 prcs.destroy(); // This doesn't always work, but it's worth a try
505 prcs = null;
506 }
507
508 // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
509 synchronized(this) {
510 this.notify();
511 }
512 }
513 }
514 catch (Exception ioe) {
515 //message(Utility.ERROR, ioe.toString());
516 //JTest
517 DebugStream.printStackTrace(ioe);
518 }
519 // If we've got to here and the state isn't STOPPED then the
520 // job is complete.
521 if(state == DownloadJob.RUNNING) {
522 progress.mirrorComplete();
523 previous_state = state;
524 state = DownloadJob.COMPLETE;
525 }
526 // refresh the workspace tree
527 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
528 }
529
530
531 /** Called by the WGet native code when the current download is
532 * completed. In turn all download listeners are informed.
533 */
534 public void downloadComplete() {
535 progress.downloadComplete();
536 }
537
538
539 public void downloadComplete(String current_file_downloading)
540 {
541 progress.downloadComplete();
542 DebugStream.println("Download complete: " + current_file_downloading);
543 }
544
545
546 /** Called by the WGet native code when the requested download returns
547 * a status code other than 200.
548 */
549 public void downloadFailed() {
550 // TODO!!
551 //failed_urls.add(current_url); // It is the current url that failed
552 progress.downloadFailed();
553 //DebugStream.println("Download failed: " + current_url);
554 }
555
556 /**
557 */
558 public void downloadWarning() {
559 progress.downloadWarning();
560 }
561
562 public AppendLineOnlyFileDocument getLogDocument() {
563 return download_log;
564 }
565
566 /**
567 * @return Returns the progress bar associated with this job.
568 */
569 public DownloadProgressBar getProgressBar() {
570 return progress;
571 }
572
573 /** Called to discover if the user wanted this thread to run or if
574 * it is paused.
575 * @return An int representing the current DownloadJob state.
576 */
577 public int getState() {
578 return state;
579 }
580
581 /** Returns the current state of the stop flag for this job.
582 * @return A boolean representing whether the user has requested to
583 * stop.
584 */
585 public boolean hasSignalledStop() {
586 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
587 state == DownloadJob.COMPLETE) {
588 return true;
589 }
590 return false;
591 }
592
593 public void setState(int state) {
594 previous_state = this.state;
595 this.state = state;
596 }
597
598 /** A convenience call.
599 * @return A String representing the url of the initial url (root node of the mirrored tree).
600 */
601 public String toString() {
602 return download_url;
603 }
604
605 /** Called by the WGet native code to signal the current progress of
606 * downloading.
607 * @param current A long representing the number of bytes that have
608 * been downloaded since the last update.
609 * @param expected A long representing the total number of bytes
610 * expected for this download.
611 */
612 public void updateProgress(long current, long expected) {
613 progress.updateProgress(current, expected);
614 }
615
616
617 // Inner thread class that reads from process downloadfrom.pl's errorstream
618 private class PerlReaderThread extends Thread {
619 Process prcs = null;
620
621 public PerlReaderThread(Process proc) {
622 this.prcs = proc;
623 }
624
625 public void run() {
626 try {
627 if(prcs != null) {
628 String message = null;
629 BufferedReader eReader = new BufferedReader(new InputStreamReader(prcs.getInputStream()));
630 while(prcs != null && (message = eReader.readLine()) != null) {
631 if(!message.equals("\n")) {
632 System.err.println("**** Perl STDOUT: " + message);
633 }
634 }
635
636 if(prcs != null && eReader != null) {
637 eReader.close();
638 eReader = null;
639 System.err.println("**** Perl ENDed.");
640 }
641 }
642 } catch(Exception e) {
643 System.err.println("Thread - caught exception: " + e);
644 }
645 }
646 }
647}
Note: See TracBrowser for help on using the repository browser.