source: main/trunk/gli/src/org/greenstone/gatherer/download/DownloadJob.java@ 31823

Last change on this file since 31823 was 31823, checked in by ak19, 7 years ago

Another bugfix to GLI's DownloadJob display: on Linux, the number of files already downloaded wasn't being displayed, whereas this worked on Windows. Wget seems to use either a backtick or a single-quote character to bookend the names of files already downloaded; we used to process only the backtick.

  • Property svn:keywords set to Author Date Id Revision
File size: 47.0 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.download;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import javax.swing.SwingUtilities;
45import org.greenstone.gatherer.Configuration;
46import org.greenstone.gatherer.DebugStream;
47import org.greenstone.gatherer.Dictionary;
48import org.greenstone.gatherer.Gatherer;
49import org.greenstone.gatherer.GAuthenticator;
50import org.greenstone.gatherer.greenstone.LocalGreenstone;
51import org.greenstone.gatherer.file.WorkspaceTree;
52import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
53import org.greenstone.gatherer.util.SafeProcess;
54import org.greenstone.gatherer.util.Utility;
55import org.greenstone.gatherer.cdm.Argument;
56import org.greenstone.gatherer.collection.*;
57
58/**
59 * @author John Thompson, Greenstone Digital Library, University of Waikato
60 * @version 2.0
61 * When modifying this class, bear in mind concurrency issues that could arise with
62 * SafeProcess's worker threads and where synchronization may be needed to prevent such issues.
63 */
64public class DownloadJob
65 implements ActionListener, SafeProcess.MainProcessHandler {
66
// Per-job log document, created in the constructor; old_callDownload() appends
// the download process's stderr lines to it.
private AppendLineOnlyFileDocument download_log;

// Progress bar for this job. Its stop_start_button and close_button are the
// event sources handled in actionPerformed().
private DownloadProgressBar progress;

// State before the most recent transition, and the current state. Both take
// their values from the state constants below and start out as STOPPED.
private int previous_state;
private int state;

// The external download process while callDownload() is running it;
// callDownload() resets this to null once the process is done.
private SafeProcess prcs = null;

// The URL being downloaded, or "host:port" for the Z3950 and SRW modes.
private final String download_url;
// NOTE(review): presumably set via setClosed() when the close button is pressed
// and read via wasClosed(); both accessors lie outside this view - confirm.
private boolean wasClosed = false;

// private String current_url;
// private String destination;
// Proxy credentials as handed to the constructor.
private final String proxy_pass;
private final String proxy_user;

//private final Vector encountered_urls;
//private Vector failed_urls;
// The Download whose assigned arguments become command-line options of downloadfrom.pl.
private Download download;
// The owning scroll pane; asked to resume its thread or to delete this job.
private DownloadScrollPane mummy;
// The arguments of 'download', keyed by argument name (see downloadToHashMap()).
private HashMap download_option;

// Job states.
public static final int COMPLETE = 0;
public static final int PAUSED = 1;
public static final int RUNNING = 2;
public static final int STOPPED = 3;

// How the total number of downloads is known. old_callDownload() switches from
// UNKNOWN_MAX to one of the other two when it sees the "<<Defined Maximum>>" or
// "<<Undefined Maximum>>" marker on the process's stderr.
public static final int UNKNOWN_MAX = 0;
public static final int DEFINED_MAX = 1;
public static final int UNDEFINED_MAX = 2;

// To prematurely terminate wget, we will need to use sockets and find a free port.
// We will look at a limited range of ports. This range will be reused (circular buffer)
private static final int PORT_BASE = 50000;
private static final int PORT_BLOCK_SIZE = 100;
private static int nextFreePort = PORT_BASE; // Keep track what port numbers we have checked for availability
int port; // package access. The socket port number this instance of DownloadJob will use
// only the main thread (where DownloadJob runs) modifies port, so no synching needed

// Download mode, passed to downloadfrom.pl as -download_mode. This file tests it
// against "Web", "MediaWiki", "Z3950" and "SRW".
private final String mode;

private String proxy_url; // only the main thread (where DownloadJob runs) modifies this, so no synching needed
110
111 /**
112 */
113 public DownloadJob(Download download, String proxy_pass, String proxy_user, DownloadScrollPane mummy, String mode, String proxy_url) {
114 URL url = null;
115 int folder_hash;
116
117 this.proxy_url = proxy_url;
118
119 download_option = downloadToHashMap(download);
120 if (!mode.equals("Z3950") && !mode.equals("SRW")) {
121 Argument url_arg = (Argument)download_option.get((String)"url");
122 download_url = url_arg.getValue();
123
124 }
125 else {
126 Argument host_arg = (Argument)download_option.get((String)"host");
127 Argument port_arg = (Argument)download_option.get((String)"port");
128 download_url = host_arg.getValue() + ":" +port_arg.getValue();
129 }
130
131 folder_hash = download_url.hashCode();
132 String log_filename = Utility.getLogDir(null) + "download-"+ mode + folder_hash + ".log";
133 File log_file = new File(log_filename);
134 if(log_file.exists()) {
135 log_file.delete();
136 }
137
138 File parent_log_file = log_file.getParentFile();
139 parent_log_file.mkdirs();
140 parent_log_file = null;
141 log_file = null;
142
143 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
144
145 this.proxy_pass = proxy_pass;
146 this.proxy_user = proxy_user;
147 this.mummy = mummy;
148 this.mode = mode;
149 this.download = download;
150
151 progress = new DownloadProgressBar(this,download_url, true);
152 //encountered_urls = new Vector();
153 //failed_urls = new Vector();
154
155 previous_state = STOPPED;
156 state = STOPPED;
157 }
158
159 private HashMap downloadToHashMap(Download download)
160 {
161 HashMap download_option = new HashMap();
162 ArrayList arguments = download.getArguments(true, false);
163 for(int i = 0; i < arguments.size(); i++) {
164 Argument argument = (Argument) arguments.get(i);
165 download_option.put(argument.getName(), argument);
166 }
167 return download_option;
168 }
169
170 /** Depending on which button on the progress bar was pushed,
171 * this method will affect the state of the DownloadJob and perhaps make
172 * calls to wget.class if necessary.
173 * @param event The ActionEvent fired from within the DownloadProgressBar
174 * which we must respond to.
175 */
176 public void old_actionPerformed(ActionEvent event) {
177 // The stop_start_button is used to alternately start or stop the
178 // job. If the current state of the job is paused then this
179 // restart is logically equivalent to a resume.
180 if(event.getSource() == progress.stop_start_button) {
181 previous_state = state;
182 if (state == RUNNING) {
183 state = STOPPED;
184 } else {
185 //previous_state = state;
186 state = RUNNING;
187 mummy.resumeThread();
188 }
189 }
190 else if (event.getSource() == progress.close_button) {
191 if(state == RUNNING) {
192 previous_state = state;
193 state = STOPPED; // do we need to do anything else to stop this?
194 }
195 mummy.deleteDownloadJob(this);
196 }
197 }
198
199 /** Depending on which button on the progress bar was pushed,
200 * this method will affect the state of the DownloadJob and perhaps make
201 * calls to wget.class if necessary.
202 * @param event The ActionEvent fired from within the DownloadProgressBar
203 * which we must respond to.
204 * Now using synchronized methods like previous_state = getState(); instead of
205 * previous_state = state; and setState(STOPPED); instead of state = STOPPED;
206 */
207 public void actionPerformed(ActionEvent event) {
208 // The stop_start_button is used to alternately start or stop the
209 // job. If the current state of the job is paused then this
210 // restart is logically equivalent to a resume.
211 if(event.getSource() == progress.stop_start_button) {
212 previous_state = getState();
213 if (getState() == RUNNING) {
214 stopDownload(); // cancels any running SafeProcess, will set the current state to STOPPED when the time is right
215 } else {
216 setState(RUNNING);
217 mummy.resumeThread();
218 }
219 }
220 else if (event.getSource() == progress.close_button) {
221 setClosed();
222 SafeProcess.log("@@@ Progress bar close button pressed");
223 if(getState() == RUNNING) {
224 previous_state = getState();
225 stopDownload(); // cancels any running SafeProcess, will set the current state to STOPPED when the time is right
226 }
227 mummy.deleteDownloadJob(this);
228 }
229 }
230
231 /** Given a portnumber to check, returns true if it is available
232 * (if nothing's listening there already). */
233 public static boolean isPortAvailable(int portnum) {
234 Socket tmpSocket = null;
235 try {
236 tmpSocket = new Socket("localhost", portnum);
237 tmpSocket.close();
238 return false;
239
240 } catch(ConnectException ex){
241 // "Signals that an error occurred while attempting to connect a socket
242 // to a remote address and port. Typically, the connection was refused
243 // remotely (e.g., no process is listening on the remote address/port)."
244 System.err.println("Port " + portnum + " not yet in use.");
245 tmpSocket = null;
246 return true;
247
248 } catch(Exception ex) {
249 // includes BindException "Signals that an error occurred while attempting
250 // to bind a socket to a local address and port. Typically, the port is in
251 // use, or the requested local address could not be assigned."
252 tmpSocket = null;
253 return false;
254 }
255 }
256
257 /** Circular buffer. Modifies the value of nextFreePort (the buffer index). */
258 private void incrementNextFreePort() {
259 int offset = nextFreePort - PORT_BASE;
260 offset = (offset + 1) % PORT_BLOCK_SIZE;
261 nextFreePort = PORT_BASE + offset;
262 }
263
// If eschewing the use of SafeProcess, reactivate (by renaming) old_callDownload()
// and old_actionPerformed(), and DownloadScrollPane.java's old_deleteDownloadJob().
/**
 * Legacy version of callDownload() that launches downloadfrom.pl through
 * Runtime.exec() and handles the process's streams itself.
 *
 * It builds the perl command line from the assigned arguments of the download,
 * starts the process (with proxy and Greenstone environment variables where
 * needed on non-Windows systems) and, for the Web and MediaWiki modes, writes
 * the number of a free port to the process's stdin. It then reads the process's
 * stderr line by line until "<<Finished>>" is seen, the stream ends or the
 * state becomes STOPPED, updating the progress bar and the download log as it
 * goes. If the job was stopped, it sends "<<STOP>>" to the perl script over a
 * socket on that port (Web and MediaWiki modes only), closes the stderr stream,
 * destroys the process unless the script gave no final response, and notifies
 * threads waiting on this job. Finally, a job still in the RUNNING state is
 * marked COMPLETE and the workspace tree is refreshed.
 *
 * Any exception raised while running or parsing is only printed to the debug
 * stream; the method then continues with the final state check and refresh.
 */
public void old_callDownload() {

    ArrayList command_list = new ArrayList();

    // the following also works for client-gli if downloading is enabled (when there's a gs2build directory inside gli)
    command_list.add(Configuration.perl_path);
    command_list.add("-S");
    command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
    command_list.add("-download_mode");
    command_list.add(mode);
    command_list.add("-cache_dir");
    command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
    // For the purposes of prematurely terminating wget from GLI (which creates a socket
    // as a communication channel between GLI and Perl), it is important to tell the script
    // that we're running as GLI. Because when running from the command prompt, it should
    // not create this socket and do the related processing.
    command_list.add("-gli");

    // Every assigned argument becomes "-name", followed by its value unless it is a flag.
    ArrayList all_arg = download.getArguments(true,false);
    for(int i = 0; i < all_arg.size(); i++) {
        Argument argument = (Argument) all_arg.get(i);
        if(argument.isAssigned()) {
            command_list.add("-" + argument.getName());
            if(argument.getType() != Argument.FLAG) {
                command_list.add(argument.getValue());
            }
        }
    }

    String [] cmd = (String []) command_list.toArray(new String[0]);
    DebugStream.println("Download job, "+command_list);

    // The first flag passed to mirrorBegun() is true only when the job had previously completed.
    if (previous_state == DownloadJob.COMPLETE) {
        progress.mirrorBegun(true, true);
    }
    else {
        progress.mirrorBegun(false, true);
    }

    try {
        Runtime rt = Runtime.getRuntime();

        String [] env = null;

        // Note: this local java.lang.Process hides the SafeProcess field of the same name.
        Process prcs = null;


        if (Utility.isWindows()) {
            prcs = rt.exec(cmd);
        }
        else {
            if (proxy_url != null && !proxy_url.equals("")) {
                // Specify proxies as environment variables
                // Need to manually specify GSDLHOME and GSDLOS also
                env = new String[4];
                // Strip the scheme so it can be re-added per protocol below.
                proxy_url = proxy_url.replaceAll("http://","");
                env[0] = "http_proxy=http://"+proxy_url;
                env[1] = "ftp_proxy=ftp://"+proxy_url;
                env[2] = "GSDLHOME=" + Configuration.gsdl_path;
                env[3] = "GSDLOS=" + Gatherer.client_operating_system;
                prcs = rt.exec(cmd, env);
            }
            else if(Gatherer.isGsdlRemote && Gatherer.isDownloadEnabled) {
                // Not Windows, but running client with download panel
                // Need to manually specify GSDLHOME and GSDLOS
                env = new String[2];
                env[0] = "GSDLHOME=" + Configuration.gsdl_path;
                env[1] = "GSDLOS=" + Gatherer.client_operating_system;
                prcs = rt.exec(cmd, env);
            }
            else {
                // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
                prcs = rt.exec(cmd);
            }
        }
        //System.out.println(newcmd);

        // Can use the following if debugging WgetDownload.pm - Reads debug stmts from the perl process' STDIN stream
        //(new PerlReaderThread(prcs)).start();

        // NOTE(review): this reader on the process's stdout is created but never
        // read from or closed anywhere in this method.
        InputStream is = prcs.getInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));

        // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
        if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI

            // Need to find an available (unused) port within the range we're looking for to pass it
            // the Perl child process, so that it may set up a listening ServerSocket at that port number
            try {
                boolean foundFreePort = false;
                // Probe at most one full cycle of the port block, starting where the last search left off.
                for(int i = 0; i < PORT_BLOCK_SIZE; i++) {

                    if(isPortAvailable(nextFreePort)) {
                        foundFreePort = true;
                        break;

                    } else {
                        incrementNextFreePort();
                    }
                }

                if(foundFreePort) {
                    // Free port number currently found becomes the port number of the socket that this
                    // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
                    this.port = nextFreePort;
                    incrementNextFreePort();

                } else {
                    // NOTE(review): the ports actually probed end at PORT_BASE+PORT_BLOCK_SIZE-1,
                    // one less than the upper bound this message reports.
                    throw new Exception("Cannot find an available port in the range "
                            + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
                            + "\nwhich is necessary for forcibly terminating wget.");
                }

                // Communicate the chosen port for this DownloadJob instance to the perl process, so
                // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
                OutputStream os = prcs.getOutputStream();
                String p = ""+this.port+"\n";
                System.err.println("Portnumber found: " + p);

                os.write(p.getBytes());
                os.close();

            } catch(Exception ex) {
                // Also reached when no free port was found, in which case nothing was sent.
                System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
            }
        }

        BufferedReader br = new BufferedReader(new InputStreamReader(prcs.getErrorStream()));
        // Capture the standard error stream and search for two particular occurrences.
        String line="";
        boolean ignore_for_robots = false;
        int max_download = DownloadJob.UNKNOWN_MAX;

        while ((line = br.readLine()) != null && !line.trim().equals("<<Finished>>") && state != STOPPED) {
            // Until one of the two markers has been seen, lines are only scanned for
            // the markers; they are neither logged nor parsed.
            if ( max_download == DownloadJob.UNKNOWN_MAX) {
                if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
                    max_download = DownloadJob.DEFINED_MAX;
                }
                else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
                    max_download = DownloadJob.UNDEFINED_MAX;
                }
            }
            else if(max_download == DownloadJob.UNDEFINED_MAX) {
                DebugStream.println(line);
                download_log.appendLine(line);
                // The first magic special test is to see if we've just
                // asked for the robots.txt file. If so we ignore
                // the next add and then the next complete/error.
                if(line.lastIndexOf("robots.txt;") != -1) {
                    DebugStream.println("***** Requesting robot.txt");
                    ignore_for_robots = true;
                }
                // If line contains "=> `" display text as the
                // currently downloading url. Unique to add download.
                else if(line.lastIndexOf("=> `") != -1) {
                    if(!ignore_for_robots) {
                        // Add download
                        // NOTE(review): new_url is only used by the commented-out call; the
                        // substring() needs a closing single quote after the backtick on this line.
                        String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                        progress.addDownload("file"); //addDownload("http:/" + new_url.substring(cachedir_prefix_length()-1));
                    }
                }
                // If line contains "/s) - `" set currently
                // downloading url to "Download Complete".
                else if(line.lastIndexOf("/s) - `") != -1) {
                    String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    if(!ignore_for_robots) {
                        DebugStream.println("Not ignore for robots");
                        // Download complete
                        downloadComplete(current_file_downloading);
                    }
                    else {
                        DebugStream.println("Ignore for robots");
                        ignore_for_robots = false;
                    }
                }
                // The already there line begins "File `..." However this
                // is only true in english, so instead I looked and there
                // are few (if any at all) other messages than those above
                // and not overwriting messages that use " `" so we'll
                // look for that. Note this method is not guaranteed to be
                // unique like the previous two.
                else if(line.lastIndexOf(" `") != -1) {
                    // Not Overwriting
                    DebugStream.println("Already there.");
                    String new_url =
                        line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    progress.addDownload("file"); //addDownload("http:/" + new_url.substring(cachedir_prefix_length()-1));
                    downloadWarning();
                }
                // Any other important message starts with the time in the form hh:mm:ss
                else if(line.length() > 7) {
                    if(line.charAt(2) == ':' && line.charAt(5) == ':') {
                        if(!ignore_for_robots) {
                            DebugStream.println("Error.");
                            downloadFailed();
                        }
                        else {
                            ignore_for_robots = false;
                        }
                    }
                }
            }
            else if (max_download == DownloadJob.DEFINED_MAX) {
                // The script reports the total and progress through <<...>> markers.
                if (line.lastIndexOf("<<Total number of record(s):") != -1) {
                    String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
                    progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
                    progress.resetFileCount();
                    progress.addDownload("files"); // for display: "Downloading files"
                }
                else if (line.lastIndexOf("<<Done>>") != -1) {
                    progress.increaseFileCount();
                }
                else if(line.lastIndexOf("<<Done:") != -1) {
                    String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
                    progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
                }

                DebugStream.println(line);
                download_log.appendLine(line);
            }
            else {
                // NOTE(review): max_download is only ever assigned one of the three constants
                // tested above, so this branch (which would exit the whole JVM) looks unreachable.
                System.out.println("Error!!");
                System.exit(-1);
            }
        }

        if(state == STOPPED) {
            boolean terminatePerlScript = true;

            // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
            // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
            // Only wget download modes Web and MediaWiki require the use of sockets to communicate
            // with the perl script in order to get wget to terminate. Other download modes, including
            // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
            // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
            // the currently running wget will finish off but other wgets are no longer launched.
            if(prcs != null && (mode.equals("Web") || mode.equals("MediaWiki"))) {

                // create a socket to the perl child process and communicate the STOP message
                Socket clientSocket = null;
                if(clientSocket == null) {
                    try {
                        clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance

                        BufferedReader clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
                        String response = clientReader.readLine(); // see if we've been connected
                        System.err.println("Communicating with perl download script on port " + this.port
                                + "\nGot response from perl: " + response);

                        // Send the STOP signal
                        OutputStream os = clientSocket.getOutputStream();
                        String message = "<<STOP>>\n";
                        os.write(message.getBytes());
                        response = clientReader.readLine(); // see whether the stop signal has been received
                        System.err.println("GLI sent STOP signal to perl to terminate wget."
                                + "\nGot response from perl: " + response);

                        response = clientReader.readLine(); // see whether the perl script is ready to be terminated
                        System.err.println("Got another response from perl: " + response);
                        os.close();

                        clientReader.close();
                        clientSocket.close(); // close the clientSocket (the Perl end will close the server socket that Perl opened)
                        clientReader = null;
                        clientSocket = null;

                        // No final response from the script: leave the process alone rather than destroying it.
                        if(response == null) {
                            terminatePerlScript = false;
                        }
                    } catch(IOException ex) {
                        System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
                    } catch(Exception ex) {
                        System.err.println("Tried to open client socket, but got exception: " + ex);
                    }
                }
            }

            //prcs.getInputStream().close();
            prcs.getErrorStream().close();
            br.close();
            br = null;
            if(terminatePerlScript) {
                prcs.destroy(); // This doesn't always work, but it's worth a try
                prcs = null;
            }

            // Notify the DownloadScrollPane which is waiting on this job to complete that we are ready
            synchronized(this) {
                this.notify();
            }
        }
    }
    catch (Exception ioe) {
        //message(Utility.ERROR, ioe.toString());
        //JTest
        DebugStream.printStackTrace(ioe);
    }
    // If we've got to here and the state isn't STOPPED then the
    // job is complete.
    if(state == DownloadJob.RUNNING) {
        progress.mirrorComplete();
        previous_state = state;
        state = DownloadJob.COMPLETE;
    }
    // refresh the workspace tree
    Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
}
573
574 public void callDownload() {
575
576 ArrayList command_list= new ArrayList();
577
578 // the following also works for client-gli if downloading is enabled (when there's a gs2build directory inside gli)
579 command_list.add(Configuration.perl_path);
580 command_list.add("-S");
581 command_list.add(LocalGreenstone.getBinScriptDirectoryPath()+"downloadfrom.pl");
582 command_list.add("-download_mode");
583 command_list.add(mode);
584 command_list.add("-cache_dir");
585 command_list.add(Gatherer.getGLIUserCacheDirectoryPath());
586 // For the purposes of prematurely terminating wget from GLI (which creates a socket
587 // as a communication channel between GLI and Perl), it is important to tell the script
588 // that we're running as GLI. Because when running from the command prompt, it should
589 // not create this socket and do the related processing.
590 command_list.add("-gli");
591
592 ArrayList all_arg = download.getArguments(true,false);
593 for(int i = 0; i < all_arg.size(); i++) {
594 Argument argument = (Argument) all_arg.get(i);
595 if(argument.isAssigned()) {
596 command_list.add("-" + argument.getName());
597 if(argument.getType() != Argument.FLAG) {
598 command_list.add(argument.getValue());
599 }
600 }
601 }
602
603 String [] cmd = (String []) command_list.toArray(new String[0]);
604 DebugStream.println("Download job, "+command_list);
605
606 if (previous_state == DownloadJob.COMPLETE) {
607 progress.mirrorBegun(true, true);
608 }
609 else {
610 progress.mirrorBegun(false, true);
611 }
612
613 try {
614 Runtime rt = Runtime.getRuntime();
615
616 String [] env = null;
617
618 if (Utility.isWindows()) {
619 prcs = new SafeProcess(cmd);
620 }
621 else {
622 if (proxy_url != null && !proxy_url.equals("")) {
623 // Specify proxies as environment variables
624 // Need to manually specify GSDLHOME and GSDLOS also
625 env = new String[5];
626 proxy_url = proxy_url.replaceAll("http://","");
627 env[0] = "http_proxy=http://"+proxy_url;
628 env[1] = "https_proxy=http://"+proxy_url; // HTTP protocol for https:// too
629 // see also https://wiki.archlinux.org/index.php/proxy_settings
630 env[2] = "ftp_proxy=ftp://"+proxy_url;
631 env[3] = "GSDLHOME=" + Configuration.gsdl_path;
632 env[4] = "GSDLOS=" + Gatherer.client_operating_system;
633
634 prcs = new SafeProcess(cmd, env, null);
635 }
636 else if(Gatherer.isGsdlRemote && Gatherer.isDownloadEnabled) {
637 // Not Windows, but running client with download panel
638 // Need to manually specify GSDLHOME and GSDLOS
639 env = new String[2];
640 env[0] = "GSDLHOME=" + Configuration.gsdl_path;
641 env[1] = "GSDLOS=" + Gatherer.client_operating_system;
642
643 prcs = new SafeProcess(cmd, env, null);
644 }
645 else {
646 // Will inherit the GLI's environment, with GSDLHOME and GSDLOS set
647 prcs = new SafeProcess(cmd);
648 }
649 }
650 //System.out.println(newcmd);
651 prcs.setMainHandler(this); // attach handler to clean up before and after process.destroy()
652 // for which DownloadJob implements SafeProcess.MainProcessHandler
653
654 // To be able to stop Wget, we use sockets to communicate with the perl process that launched wget
655 if (mode.equals("Web") || mode.equals("MediaWiki")) { // wget download modes other than OAI
656
657 // Need to find an available (unused) port within the range we're looking for to pass it
658 // the Perl child process, so that it may set up a listening ServerSocket at that port number
659 try {
660 boolean foundFreePort = false;
661 for(int i = 0; i < PORT_BLOCK_SIZE; i++) {
662
663 if(isPortAvailable(nextFreePort)) {
664 foundFreePort = true;
665 break;
666
667 } else {
668 incrementNextFreePort();
669 }
670 }
671
672 if(foundFreePort) {
673 // Free port number currently found becomes the port number of the socket that this
674 // DownloadJob instance will be connecting to when the user wants to prematurely stop Wget.
675 this.port = nextFreePort;
676 incrementNextFreePort(); //// Necessary?
677
678 } else {
679 throw new Exception("Cannot find an available port in the range "
680 + PORT_BASE + "-" + (PORT_BASE+PORT_BLOCK_SIZE)
681 + "\nwhich is necessary for forcibly terminating wget.");
682 }
683
684 // Communicate the chosen port for this DownloadJob instance to the perl process, so
685 // that it can set up a ServerSocket at that port to listen for any signal to terminate wget
686 //OutputStream os = prcs.getOutputStream();
687 String p = ""+this.port+"\n";
688 System.err.println("Portnumber found: " + p);
689
690 prcs.setInputString(p);
691
692 } catch(Exception ex) {
693 System.err.println("Sent available portnumber " + this.port + " to process' outputstream.\nBut got exception: " + ex);
694 }
695 }
696
697 ProcessErrHandler errHandler = new ProcessErrHandler(); // meaningful output comes from prcs stderr
698 ProcessOutHandler outHandler = new ProcessOutHandler(); // debugging output comes from prcs' stdout
699
700 int exitVal = prcs.runProcess(null, outHandler, errHandler);
701
702 // if prcs is interrupted (cancelled) during the blocking runProcess() call,
703 // as happens on state == STOPPED, then
704 // beforeWaitingForStreamsToEnd() is called before the process' worker threads come to a halt
705 // and afterStreamsEnded() is called when the process' worker threads have halted,
706 // beforeProcessDestroy() is called before the process is destroyed,
707 // and afterProcessDestroy() is called after the proc has been destroyed.
708 // If when beforeWaitingForStreamsEnd() stage the perl was still running but had been
709 // told to stop, then the beforeWaitingForStreamsEnd() method will make sure to communicate
710 // with the perl process over a socket and send it the termination message,
711 // which will also kill any runnning wget that perl launched.
712 // In that case, destroy() is actually called on the process at last.
713
714 }
715 catch (Exception ioe) {
716 SafeProcess.log(ioe);
717 DebugStream.printStackTrace(ioe);
718 }
719
720 // now the process is done, we can at last null it
721 prcs = null;
722
723 // If we've got to here and the state isn't STOPPED then the
724 // job is complete.
725 if(getState() == DownloadJob.RUNNING) {
726 progress.mirrorComplete();
727 previous_state = getState();
728 setState(DownloadJob.COMPLETE);
729 }
730
731 SafeProcess.log("@@@@ DONE callDownload()");
732
733 // refresh the workspace tree
734 Gatherer.g_man.refreshWorkspaceTree(WorkspaceTree.DOWNLOADED_FILES_CHANGED);
735 }
736
737 private synchronized boolean isStopped() { return state == STOPPED; }
738
739 // called when the user cancelled the download and we're told to stop both our external perl process
740 // and the wget process that it in turn launched
741 public void stopDownload() {
742 if(prcs != null) {
743 SafeProcess.log("@@@ Going to cancel the SafeProcess...");
744
745 // Whether a process ends naturally or is prematurely ended, beforeWaitingForStreamsToEnd()
746 // will be called. We've hooked this in to calling tellPerlToTerminateWget() only if the
747 // process is still running when cancel is pressed, but not when it's naturally terminated.
748 boolean hadToSendInterrupt = prcs.cancelRunningProcess(); // returns false if it was already terminating/terminated, true if interrupt sent
749
750 } else {
751 System.err.println("@@@@ No SafeProcess to cancel");
752 }
753
754 //setState(STOPPED); // would set it to stop on cancel, even if it already naturally terminated
755
756 }
757
758//*********** START of implementing interface Safeprocess.MainProcessHandler
759 // before and after processDestroy only happen when interrupted AND terminatePerlScript=true
760 public void beforeProcessDestroy() {}
761 public void afterProcessDestroy() {}
762
763 // after blocking call on closing up streamgobbler worker threads that happens
764 // upon natural termination or interruption of process' main body/thread.
765 // if not overriding, then return the parameter forciblyTerminating as-is
766 public boolean afterStreamsEnded(boolean forciblyTerminating) { return forciblyTerminating; }
767
768 // called after the SafeProcess has fully terminated (naturally or via process.destroy())
769 // and has been cleaned up
770 public void doneCleanup(boolean wasForciblyTerminated) {
771 // let the user know they can cancel again now cleanup phase is done
772 progress.enableCancelJob(true);
773
774 if(wasForciblyTerminated) {
775 setState(STOPPED); // sets it to stop only if process truly was prematurely terminated, not merely
776 // if the cancel button was clicked when it had already naturally terminated
777
778 // If the user had pressed the Close button to terminate the running job, then
779 // we're now ready to remove the display of the until now running job
780 // from the download progress bar interface
781 // But don't bother removing the progress bar if the user had only pressed the Stop button
782 if(wasClosed()) {
783 mummy.deleteCurrentDownloadJob(this);
784 }
785 }
786 }
787
788 // before blocking call of ending streamgobbler worker threads that happens
789 // after process' main body/thread has naturally terminated or been interrupted
790 public boolean beforeWaitingForStreamsToEnd(boolean forciblyTerminating) {
791 // let the user know they can't cancel during cleanup phase
792 progress.enableCancelJob(false);
793
794 SafeProcess.log("**** in beforeWaitingForStreamsToEnd()");
795
796 // state would not be STOPPED if cancel was pressed after the process naturally terminated anyway
797 // in that case we don't need to send perl the signal to terminate WGET
798 if(!forciblyTerminating) { //if(!isStopped()) {
799 SafeProcess.log("*** Process not (yet) cancelled/state not (yet) stopped");
800 SafeProcess.log("*** But process has naturally terminated (process streams are being closed before any interruption signal can be received), so won't be destroying process even on interrupt");
801 return false; // for us to be in this method at all with forciblyTerminating being false
802 // means the process is already naturally terminating, so don't unnaturally destroy it
803 }
804
805 // else the process is still running and we've been told to stop, so tell perl to stop wget first
806 // (so that process destroy can then be called thereafter)
807 return tellPerlToTerminateWget();
808 }
809//*********** END of implementing interface Safeprocess.MainProcessHandler
810
811 public boolean tellPerlToTerminateWget() {
812 SafeProcess.log("**** in tellPerlToTerminateWget()");
813
814 boolean terminatePerlScript = true;
815
816 // When GLI is working with wget-based download modes other than OAI (MediaWiki and Web
817 // download) and the STOP button has been pressed, wget needs to be prematurely terminated.
818 // Only wget download modes Web and MediaWiki require the use of sockets to communicate
819 // with the perl script in order to get wget to terminate. Other download modes, including
820 // wgetdownload mode OAI, can terminate in the traditional manner: close process inputstream
821 // and kill perl process. OAI launches many wgets. So that when the perl process is terminated,
822 // the currently running wget will finish off but other wgets are no longer launched.
823 if((mode.equals("Web") || mode.equals("MediaWiki"))) {
824 SafeProcess.log("@@@ Socket communication to end wget");
825 // create a socket to the perl child process and communicate the STOP message
826 Socket clientSocket = null;
827 BufferedReader clientReader = null;
828 OutputStream os = null;
829
830 if(clientSocket == null) {
831 try {
832 clientSocket = new Socket("localhost", this.port); // connect to the port chosen for this DownloadJob instance
833
834 clientReader = new BufferedReader(new InputStreamReader(clientSocket.getInputStream()));
835 String response = clientReader.readLine(); // see if we've been connected
836 System.err.println("Communicating with perl download script on port " + this.port
837 + "\nGot response from perl: " + response);
838
839 // Send the STOP signal
840 os = clientSocket.getOutputStream();
841 String message = "<<STOP>>\n";
842 os.write(message.getBytes());
843 response = clientReader.readLine(); // see whether the stop signal has been received
844 System.err.println("GLI sent STOP signal to perl to terminate wget."
845 + "\nGot response from perl: " + response);
846
847 response = clientReader.readLine(); // see whether the perl script is ready to be terminated
848 System.err.println("Got another response from perl: " + response);
849
850 if(response == null) { // why? Is it because the process has already terminated naturally if response is null?
851 terminatePerlScript = false;
852 }
853 } catch(IOException ex) {
854 if(ex instanceof IOException && ex.getMessage().indexOf("Connection refused") != -1) {
855 terminatePerlScript = false; // no socket listening on other end because process ended
856 System.err.println("Tried to communicate through client socket - port " + this.port + ", but the process seems to have already ended naturally");
857 } else {
858 System.err.println("Tried to communicate through client socket - port " + this.port + ", but got exception: " + ex);
859 }
860
861 } catch(Exception ex) {
862 System.err.println("Tried to open client socket, but got exception: " + ex);
863 } finally {
864 SafeProcess.closeResource(os);
865 SafeProcess.closeResource(clientReader);
866 SafeProcess.closeSocket(clientSocket); // close the clientSocket (the Perl end will close the server socket that Perl opened)
867 os = null;
868 clientReader = null;
869 clientSocket = null;
870 }
871 }
872 }
873
874 return terminatePerlScript; // if true, it will call destroy() on the SafeProcess' process
875 }
876
877
878 /** Called by the WGet native code when the current download is
879 * completed. In turn all download listeners are informed.
880 */
881 public void downloadComplete() {
882 progress.downloadComplete(); // now this is synchronized
883 }
884
885
886 public void downloadComplete(String current_file_downloading)
887 {
888 progress.downloadComplete(); // now this is synchronized
889 DebugStream.println("Download complete: " + current_file_downloading);
890 }
891
892
893 /** Called by the WGet native code when the requested download returns
894 * a status code other than 200.
895 */
896 public void downloadFailed() {
897 // TODO!!
898 //synchronized(failed_urls) {
899 //failed_urls.add(current_url); // It is the current url that failed
900 //}
901 progress.downloadFailed(); // now this is synchronized
902 //DebugStream.println("Download failed: " + current_url);
903 }
904
905 /**
906 */
907 public void downloadWarning() {
908 progress.downloadWarning(); // now this is synchronized
909 }
910
911 public AppendLineOnlyFileDocument getLogDocument() {
912 return download_log;
913 }
914
915 /**
916 * @return Returns the progress bar associated with this job.
917 */
918 public DownloadProgressBar getProgressBar() {
919 return progress;
920 }
921
922 /** Called to discover if the user wanted this thread to run or if
923 * it is paused.
924 * @return An int representing the current DownloadJob state.
925 */
926 public synchronized int getState() {
927 return state;
928 }
929
930 /** @return true if the close button of the DownloadProgressBar was pressed,
931 * false otherwise such as if the Stop button had been pressed.
932 */
933 private synchronized boolean wasClosed() {
934 return this.wasClosed;
935 }
936
937 /** Returns the current state of the stop flag for this job.
938 * @return A boolean representing whether the user has requested to
939 * stop.
940 */
941 public synchronized boolean hasSignalledStop() {
942 if(state == DownloadJob.STOPPED || state == DownloadJob.PAUSED ||
943 state == DownloadJob.COMPLETE) {
944 return true;
945 }
946 return false;
947 }
948
949 public synchronized void setState(int state) {
950 previous_state = this.state;
951 this.state = state;
952 }
953
954 private synchronized void setClosed() {
955 this.wasClosed = true;
956 }
957
958 /** A convenience call.
959 * @return A String representing the url of the initial url (root node of the mirrored tree).
960 */
961 public String toString() {
962 return download_url;
963 }
964
965 /** Called by the WGet native code to signal the current progress of
966 * downloading.
967 * @param current A long representing the number of bytes that have
968 * been downloaded since the last update.
969 * @param expected A long representing the total number of bytes
970 * expected for this download.
971 */
972 public void updateProgress(long current, long expected) {
973 progress.updateProgress(current, expected);
974 }
975
976
977 /*
978 Go through https://docs.oracle.com/javase/tutorial/essential/concurrency/atomicvars.html series of
979 Java articles on concurrency again.
980 Go through http://docs.oracle.com/javase/tutorial/uiswing/concurrency/
981
982 http://stackoverflow.com/questions/574240/is-there-an-advantage-to-use-a-synchronized-method-instead-of-a-synchronized-blo
983
984 "Not only do synchronized methods not lock the whole class, but they don't lock the whole instance either. Unsynchronized methods in the class may still proceed on the instance."
985 "Only the syncronized methods are locked. If there are fields you use within synced methods that are accessed by unsynced methods, you can run into race conditions."
986
987 "synchronizing on "this" is considered in some circles to be an anti-pattern. The unintended consequence is that outside of the class someone can lock on an object reference that is equal to "this" and prevent other threads from passing the barriers within the class potentially creating a deadlock situation. Creating a "private final Object = new Object();" variable purely for locking purposes is the often used solution. Here's another question relating directly to this issue. http://stackoverflow.com/questions/442564/avoid-synchronizedthis-in-java?lq=1"
988
989 "A private lock is a defensive mechanism, which is never a bad idea.
990
991 Also, as you alluded to, private locks can control granularity. One set of operations on an object might be totally unrelated to another but synchronized(this) will mutually exclude access to all of them."
992
993 http://stackoverflow.com/questions/8393883/is-synchronized-keyword-exception-safe
994 "In any scoped thread-safe block, the moment you get out of it, the thread-safety is gone."
995 "In case of an exception the lock will be released."
996
997 http://stackoverflow.com/questions/8259479/should-i-synchronize-listener-notifications-or-not
998 "Use a CopyOnWriteArrayList for your listener arrays."
999 "If you use the CopyOnWriteArrayList, then you don't have to synchronize when iterating."
1000 "CopyOnWriteArrayList is thread-safe, so there is no need to synchronize."
1001
1002 "Use a ConcurrentLinkedQueue<Listener> ... for this kind of problems: adding, removing and iterating simultaneously on a collection.
1003 A precision : this solution prevents a listener from being called from the very moment it is deregistered."
1004 "It means that you start iterating, an element is added, it will be called, another is removed, it won't, all this in the same iteration cycle.
1005 It's the best of both world: ensuring synchronization, while being fine grained on who gets called and who's not."
1006
1007 http://stackoverflow.com/questions/8260205/when-a-listener-is-removed-is-it-okay-that-the-event-be-called-on-that-listener
1008
1009 http://stackoverflow.com/questions/2282166/java-synchronizing-on-primitives
1010
1011 1. You can't lock on a primitive and
1012 2. Don't lock on a Long unless you're careful how you construct them. Long values created by autoboxing or Long.valueOf() in a certain range are guaranteed to be the same across the JVM which means other threads could be locking on the same exact Long object and giving you cross-talk. This can be a subtle concurrency bug (similar to locking on intern'ed strings).
1013
1014 Cross-talk:
1015 "In electronics, crosstalk is any phenomenon by which a signal transmitted on one circuit or channel of a transmission system creates an undesired effect in another circuit or channel. Crosstalk is usually caused by undesired capacitive, inductive, or conductive coupling from one circuit, part of a circuit, or channel, to another."
1016 */
1017
1018
1019 // Inner thread class that reads from process downloadfrom.pl's std output stream
1020 private class ProcessOutHandler extends SafeProcess.CustomProcessHandler {
1021
1022 public ProcessOutHandler() {
1023 super(SafeProcess.STDOUT);
1024 }
1025
1026 public void run(Closeable stream) {
1027 InputStream is = (InputStream) stream;
1028 BufferedReader eReader = null;
1029 try {
1030
1031 String message = null;
1032 eReader = new BufferedReader(new InputStreamReader(is));
1033 while(!Thread.currentThread().isInterrupted() && (message = eReader.readLine()) != null) {
1034 if(!message.equals("\n")) {
1035 System.err.println("**** Perl STDOUT: " + message);
1036 }
1037 }
1038 if(Thread.currentThread().isInterrupted()) {
1039 System.err.println("**** Perl INTERRUPTed.");
1040 } else {
1041 System.err.println("**** Perl ENDed.");
1042 }
1043
1044 } catch(Exception e) {
1045 System.err.println("Thread - caught exception: " + e);
1046 } finally {
1047 if(Thread.currentThread().isInterrupted()) {
1048 SafeProcess.log("@@@ Successfully interrupted " + Thread.currentThread().getName() + ".");
1049 }
1050 SafeProcess.closeResource(eReader);
1051 eReader = null;
1052 }
1053 }
1054 }
1055
1056
1057 private class ProcessErrHandler extends SafeProcess.CustomProcessHandler {
1058
1059 public ProcessErrHandler() {
1060 super(SafeProcess.STDERR);
1061 }
1062
1063 public void run(Closeable stream) {
1064 InputStream eis = (InputStream) stream;
1065
1066 BufferedReader br = null;
1067 try {
1068 br = new BufferedReader(new InputStreamReader(eis));
1069
1070 // Capture the standard error stream and search for two particular occurrences.
1071 String line="";
1072 boolean ignore_for_robots = false;
1073 int max_download = DownloadJob.UNKNOWN_MAX;
1074
1075 // handle to outer class objects that need synchronization (on either objects or their methods)
1076 DownloadProgressBar progress = DownloadJob.this.progress;
1077 AppendLineOnlyFileDocument download_log = DownloadJob.this.download_log;
1078
1079 while (!Thread.currentThread().isInterrupted() && (line = br.readLine()) != null
1080 && !line.trim().equals("<<Finished>>") /*&& !isStopped()*/) {
1081 if (max_download == DownloadJob.UNKNOWN_MAX) {
1082 if(line.lastIndexOf("<<Defined Maximum>>") != -1) {
1083 max_download = DownloadJob.DEFINED_MAX;
1084 }
1085 else if (line.lastIndexOf("<<Undefined Maximum>>") != -1) {
1086 max_download = DownloadJob.UNDEFINED_MAX;
1087 }
1088 }
1089 else if(max_download == DownloadJob.UNDEFINED_MAX) {
1090 DebugStream.println(line);
1091 download_log.appendLine(line); // now synchronized
1092 // The first magic special test is to see if we've just
1093 // asked for the robots.txt file. If so we ignore
1094 // the next add and then the next complete/error.
1095 if(line.lastIndexOf("robots.txt;") != -1) {
1096 DebugStream.println("***** Requesting robot.txt");
1097 ignore_for_robots = true;
1098 }
1099 // If line contains "=> `" display text as the
1100 // currently downloading url. Unique to add download.
1101 else if(line.lastIndexOf("=> `") != -1) {
1102 if(!ignore_for_robots) {
1103 // Add download
1104 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
1105
1106 // now synchronized
1107 progress.addDownload("file"); //addDownload("http:/" + new_url.substring(cachedir_prefix_length()-1));
1108 }
1109 }
1110 // If line contains "/s) - `" set currently
1111 // downloading url to "Download Complete".
1112 // Currently: on windows ` marks start quote of downloaded file, but on linux ' marks it
1113 else if(line.lastIndexOf("/s) - `") != -1 || line.lastIndexOf("/s) - '") != -1) {
1114 String startChar = (line.lastIndexOf("/s) - `") != -1) ? "`" : "'";
1115 String current_file_downloading = line.substring(line.indexOf(startChar) + 1, line.lastIndexOf("'"));
1116 if(!ignore_for_robots) {
1117 DebugStream.println("Not ignore for robots");
1118 // Download complete
1119 downloadComplete(current_file_downloading); // synchronized
1120 }
1121 else {
1122 DebugStream.println("Ignore for robots");
1123 ignore_for_robots = false;
1124 }
1125 }
1126 // The already there line begins "File `..." However this
1127 // is only true in english, so instead I looked and there
1128 // are few (if any at all) other messages than those above
1129 // and not overwriting messages that use " `" so we'll
1130 // look for that. Note this method is not guarenteed to be
1131 // unique like the previous two.
1132 else if(line.lastIndexOf(" `") != -1) {
1133 // Not Overwriting
1134 DebugStream.println("Already there.");
1135 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
1136
1137 progress.addDownload("file"); //addDownload("http:/" + new_url.substring(cachedir_prefix_length()-1));
1138 downloadWarning();
1139 }
1140 // Any other important message starts with the time in the form hh:mm:ss
1141 else if(line.length() > 7) {
1142 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
1143 if(!ignore_for_robots) {
1144 DebugStream.println("Error.");
1145 downloadFailed();
1146 }
1147 else {
1148 ignore_for_robots = false;
1149 }
1150 }
1151 }
1152 }
1153 else if (max_download == DownloadJob.DEFINED_MAX) {
1154 if (line.lastIndexOf("<<Total number of record(s):") != -1) {
1155 String total_ID = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
1156
1157 progress.setTotalDownload((Integer.valueOf(total_ID)).intValue());
1158 progress.resetFileCount();
1159 progress.addDownload("files"); // for display: "Downloading files"
1160
1161 }
1162 else if (line.lastIndexOf("<<Done>>") != -1) {
1163 progress.increaseFileCount();
1164 }
1165 else if(line.lastIndexOf("<<Done:") != -1) {
1166 String completed_amount = line.substring(line.indexOf(":") + 1, line.indexOf(">"));
1167 progress.increaseFileCount((Integer.valueOf(completed_amount)).intValue());
1168 }
1169
1170 DebugStream.println(line);
1171 download_log.appendLine(line);
1172 }
1173 else {
1174 System.out.println("Error!!");
1175 System.exit(-1);
1176 }
1177 }
1178
1179 } catch (IOException ioe) {
1180 //message(Utility.ERROR, ioe.toString());
1181 //JTest
1182 DebugStream.printStackTrace(ioe);
1183
1184 } finally {
1185 if(Thread.currentThread().isInterrupted()) { // if the thread this class is running in is interrupted
1186 SafeProcess.log("@@@ Successfully interrupted " + Thread.currentThread().getName() + ".");
1187 }
1188
1189 SafeProcess.closeResource(br);
1190 br = null;
1191 }
1192
1193 }
1194 }
1195}
Note: See TracBrowser for help on using the repository browser.