source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 6622

Last change on this file since 6622 was 6622, checked in by jmt12, 20 years ago

More modifications to mirroring, including testing for a valid version of Wget (and complaining if it's missing or old) and rearranging buttons on the GProgressBar

  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * <BR><BR>
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * <BR><BR>
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * <BR><BR>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * <BR><BR>
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * <BR><BR>
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.collection;

import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.tree.*;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.WGet;
import org.greenstone.gatherer.file.FileNode;
import org.greenstone.gatherer.file.FileSystemModel;
import org.greenstone.gatherer.file.WorkspaceTreeModel;
import org.greenstone.gatherer.gui.GProgressBar;
import org.greenstone.gatherer.util.GURL;
import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
import org.greenstone.gatherer.util.Utility;
/**
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.0
 */
public class Job
    implements ActionListener {

    private boolean debug;
    private boolean higher_directories;
    private boolean no_parents;
    private boolean other_hosts;
    private boolean page_requisites;
    private boolean quiet;

    private GProgressBar progress;

    private GURL initial = null;
    private GURL url = null;

    // private TreeModel model;

    private int depth;
    private int previous_state;
    private int state;

    private String current_url;
    private String destination;
    private String proxy_pass;
    private String proxy_user;

    private Vector encountered_urls;
    private Vector failed_urls;

    private WGet mummy;

    public static int COMPLETE = 0;
    public static int PAUSED = 1;
    public static int RUNNING = 2;
    public static int STOPPED = 3;

    /**
     */
    public Job(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
        // this.model = model;

        this.debug = debug;
        this.no_parents = no_parents;
        this.other_hosts = other_hosts;
        this.page_requisites = page_requisites;
        this.quiet = quiet;
        this.initial = new GURL(initial);
        this.depth = depth;
        this.destination = destination;
        this.proxy_pass = proxy_pass;
        this.proxy_user = proxy_user;
        this.mummy = mummy;

        progress = new GProgressBar(this, initial.toString(), simple);

        encountered_urls = new Vector();
        failed_urls = new Vector();

        previous_state = STOPPED;
        state = STOPPED;
    }
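
    // Illustrative usage sketch only (the owning WGet instance, cache directory and
    // proxy values named here are assumptions for illustration, not taken from the
    // original source):
    //
    //   Job job = new Job(false, true, false, true, false,
    //                     new URL("http://www.example.com/"), 2,
    //                     cache_dir, proxy_pass, proxy_user, wget_owner, false);
    //   job.callWGet();              // spawn the external wget and parse its output
    //   int state = job.getState();  // Job.COMPLETE once the mirror has finished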

    /** Depending on which button on the progress bar was pushed,
     * this method will affect the state of the Job and perhaps make
     * calls to wget.class if necessary.
     * @param event The ActionEvent fired from within the GProgressBar
     * which we must respond to.
     */
    public void actionPerformed(ActionEvent event) {
        // The action button is used to alternately start or stop the
        // job. If the current state of the job is paused then this
        // restart is logically equivalent to a resume.
        if(event.getSource() == progress.action) {
            previous_state = state;
            if(state == RUNNING) {
                state = PAUSED;
            }
            else {
                state = RUNNING;
                mummy.resumeThread();
            }
        }
        else if (event.getSource() == progress.cancel) {
            state = STOPPED; // Should already be stopped.
            mummy.deleteJob(this);
        }
    }
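
    // Summary of the handling above: the action button pauses a RUNNING job and
    // (re)starts a PAUSED, STOPPED or COMPLETE one, resuming the owning WGet thread;
    // the cancel button leaves the job STOPPED and asks the owner to remove it.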

    /** Called by the WGet native code to inform us of a new download starting.
     * @param raw_url The url that is being downloaded, as a String.
     */
    public void addDownload(String raw_url) {
        if(!encountered_urls.contains(raw_url)) {
            encountered_urls.add(raw_url);
        }
        // Regardless, create a new GURL
        current_url = raw_url;
        url = new GURL(raw_url);
        progress.addDownload(raw_url);
    }

    /** Used to advise the Job of a newly parsed link. It's up to the Job
     * to decide if it already knows about this url, and if not to
     * update its progress bar.
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link.
     * @return A boolean indicating if the url was added.
     */
    public boolean addLink(String raw_url, int type) {
        ///ystem.out.println("addLink("+url+", "+type+")");
        if(!encountered_urls.contains(raw_url)) {
            // Add it to the urls we've seen.
            encountered_urls.add(raw_url);
            // Add it to the links for the current GURL.

            // Add it to the progress file count.
            progress.increaseFileCount();
            return true;
        }
        // Otherwise add it to the children links of the current GURL
        initial.addLink(raw_url);

        // We've seen it before. Don't count it again.
        return false;
    }

    public void callWGet() {
        // Build parameter string. Note that we never clobber, and we continue if possible.
        String command = Gatherer.config.getWGetPath() + " -nc -c ";

        // Add the destination parameter
        if(destination != null) {
            command = command + "-P " + destination + " ";
        }

        if(depth < 0) {
            // Infinite recursion
            command = command + "-r ";
        }
        else if (depth == 0) {
            // Just this page.
        }
        else if (depth > 0) {
            // Recursion to the specified depth.
            command = command + "-r -l" + depth + " ";
        }

        // Determine if we have to use a proxy.
        if(Gatherer.config.get("general.use_proxy", true)) {
            String proxy_host = Gatherer.config.getString("general.proxy_host", true);
            String proxy_port = Gatherer.config.getString("general.proxy_port", true);
            // Find out whether the user has already authenticated themselves
            String user_pass = null;
            String address = proxy_host + ":" + proxy_port;
            int count = 0;
            while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
                Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
                count++;
            }
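            // The cached entry appears to be a single "username@password" string keyed
            // by host:port; only the part before the '@' is written into the command
            // below, and the password is not appended (the command carries a masked
            // --proxy-passwd=*** instead).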
            if(count >= 3) {
                state = STOPPED;
                return;
            }
            if(user_pass.indexOf("@") != -1) {
                // Write the use proxy command
                command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=\"" + user_pass.substring(0, user_pass.indexOf("@")) + "\" --proxy-passwd=*** -Y on ";
            }
            else {
                Gatherer.println("Unknown user/pass");
            }
        }

        // The user can either choose to mirror all of the page requisites...
        if(page_requisites) {
            command = command + "-p ";
        }
        // ...or not, in which case we ensure links are rewritten.
        else {
            command = command + "-k ";
        }

        if(other_hosts) {
            command = command + "-H ";
        }

        // Finally tell it the site to download.
        command = command + initial.toString();
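        // For illustration only (the exact string depends on the configuration and the
        // options chosen above), the assembled command resembles:
        //   /path/to/wget -nc -c -P <destination> -r -l3 -k http://www.example.com/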

        if(previous_state == Job.COMPLETE) {
            progress.mirrorBegun(true, true);
        }
        else {
            progress.mirrorBegun(false, true);
        }

        // Run it
        try {
            Gatherer.println("Cmd: " + command);
            Runtime rt = Runtime.getRuntime();
            Process prcs = rt.exec(command);
            InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
            BufferedReader br = new BufferedReader(isr);
            // Capture the standard error stream and search for particular occurrences.
            String line;
            boolean ignore_for_robots = false;
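
            // The parsing below keys off wget's progress messages on stderr. For the
            // older wget releases this code targets, those lines look roughly like the
            // following (illustrative only, not captured from a real run):
            //   --12:34:56--  http://www.example.com/index.html
            //              => `cache/www.example.com/index.html'
            //   12:34:57 (15.2 KB/s) - `cache/www.example.com/index.html' saved [2893/2893]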
            while ((line = br.readLine()) != null) {
                Gatherer.println(line);

                // The first magic special test is to see if we've just
                // asked for the robots.txt file. If so we ignore
                // the next add and then the next complete/error.
                if(line.lastIndexOf("robots.txt;") != -1) {
                    Gatherer.println("***** Requesting robots.txt");
                    ignore_for_robots = true;
                }
                // If the line contains "=> `", display the text as the
                // currently downloading url. Unique to addDownload.
                else if(line.lastIndexOf("=> `") != -1) {
                    if(!ignore_for_robots) {
                        // Add download
                        String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                        // Remove the destination guff
                        if(destination != null) {
                            new_url = new_url.substring(destination.length());
                        }
                        // new_url should still begin with a '/', so this forms a full http:// url
                        addDownload("http:/" + new_url);
                    }
                }
                // If the line contains "/s) - `", set the currently
                // downloading url to "Download Complete".
                else if(line.lastIndexOf("/s) - `") != -1) {
                    String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    if(!ignore_for_robots) {
                        Gatherer.println("Not ignore for robots");
                        // Download complete
                        downloadComplete(current_file_downloading);
                    }
                    else {
                        Gatherer.println("Ignore for robots");
                        ignore_for_robots = false;
                    }
                }
                // The "already there" line begins "File `...". However this is only true
                // in English, so instead I looked and there are few (if any at all) other
                // messages than those above and the not-overwriting messages that use " `",
                // so we'll look for that. Note this method is not guaranteed to be unique
                // like the previous two.
                else if(line.lastIndexOf(" `") != -1) {
                    // Not overwriting
                    Gatherer.println("Already there.");
                    String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    // For some strange reason this won't compile
                    // src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
                    // symbol  : class CAKE
                    // location: class org.greenstone.gatherer.collection.Job
                    /* ***********************************************************
                       CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
                       *********************************************************** */
                    // Remove the destination guff
                    if(destination != null) {
                        new_url = new_url.substring(destination.length());
                    }
                    addDownload("http:/" + new_url);
                    downloadWarning();
                }
                // Any other important message starts with the time in the form hh:mm:ss
                else if(line.length() > 7) {
                    if(line.charAt(2) == ':' && line.charAt(5) == ':') {
                        if(!ignore_for_robots) {
                            Gatherer.println("Error.");
                            downloadFailed();
                        }
                        else {
                            ignore_for_robots = false;
                        }
                    }
                }
            }
            // Wait for the external process to finish before declaring the job complete.
            prcs.waitFor();
        }
        catch (Exception ioe) {
            //message(Utility.ERROR, ioe.toString());
            Gatherer.printStackTrace(ioe);
        }
        // If we've got to here and the state isn't STOPPED then the
        // job is complete.
        if(state == Job.RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = Job.COMPLETE;
        }
    }

    /** The most important part of the Job class, this method is
     * responsible for calling the WGet native methods used to
     * mirror the indicated url. By this stage all the variables
     * necessary should be set and we need only build up the
     * parameter string and make the call.
     */
    public void callWGetNative() {
        Vector args = new Vector();

        // Let the GProgressBar know we're starting, just in case the user hasn't
        // told us to. If the urls are being downloaded a second time and the first
        // attempt was successful (ie the previous job was complete), then the user
        // is forcing us to remirror, so the counters are reset. Note that this can
        // cause the result line to look something like this:
        //   Downloaded 12 of 12 files (8 warnings, 0 errors).
        // The warnings would be something like 'File already downloaded', but the
        // total number of files and the number successfully downloaded will be correct.
        if(previous_state == Job.COMPLETE) {
            progress.mirrorBegun(true, false);
        }
        else {
            progress.mirrorBegun(false, false);
        }

        // Parse arguments into array.
        args.add(Utility.BASE_DIR + "wget");
        args.add("-d");
        args.add("-o");
        args.add("debug.txt");

        if(destination != null) {
            args.add("-P");
            args.add(destination);
        }

        if(depth < 0) {
            // Infinite recursion
            args.add("-r");
        }
        else if (depth == 0) {
            // Just this page.
        }
        else if (depth > 0) {
            // Recursion to the specified depth.
            args.add("-r");
            args.add("-l");
            args.add(String.valueOf(depth));
        }

        if(previous_state == PAUSED) {
            args.add("-nc");
            args.add("-c");
        }

        if(proxy_user != null) {
            args.add("--proxy-user=" + proxy_user);
            args.add("--proxy-passwd=" + proxy_pass);
        }

        if(page_requisites) {
            args.add("-p");
        }

        if(quiet) {
            args.add("-q");
        }

        if(other_hosts) {
            args.add("-H");
        }

        args.add(initial.toString());
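        // For illustration (the exact contents depend on the options above), args now
        // holds something like:
        //   [<BASE_DIR>wget, -d, -o, debug.txt, -P, <destination>, -r, -l, 3, -p, http://www.example.com/]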

        Gatherer.println("Calling wget ");
        for(Enumeration e = args.elements(); e.hasMoreElements();) {
            Gatherer.println(e.nextElement() + " ");
        }
        Gatherer.println("");

        // Run home to mummy.
        int value = mummy.wget(args.size(), args.toArray(), debug);

        // If we've got to here and the state isn't STOPPED then the job is complete.
        if(state == RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = COMPLETE;
        }
    }

    /** Called by the WGet native code when the current download is
     * completed. In turn all download listeners are informed.
     */
    public void downloadComplete() {
        progress.downloadComplete();
        url = null;
        current_url = null;
    }

    public void downloadComplete(String current_file_downloading) {
        progress.downloadComplete();
        Gatherer.println("Current File: " + current_file_downloading);
        //WorkspaceTreeModel.refreshWebCacheMappings();
        if(Gatherer.g_man.collection_pane.workspace_tree != null) {
            FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.collection_pane.workspace_tree.getModel();
            File new_file = new File(current_file_downloading);
            File parent_file = new_file.getParentFile();
            String download_cache = Utility.getCacheDir().getAbsolutePath();
            ArrayList raw_path = new ArrayList();
            while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
                raw_path.add(0, parent_file.getName());
                parent_file = parent_file.getParentFile();
            }
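            // For example (paths illustrative only): a file cached at
            //   <cache>/www.example.com/about/index.html
            // leaves raw_path holding ["www.example.com", "about"] at this point; the
            // cache label and the tree root are prepended below to build the TreePath.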
            download_cache = null;
            // Add download cache name
            /** @todo - add to dictionary */
            raw_path.add(0, "Mirroring.Mirror_Cache");
            // And the root node
            raw_path.add(0, tree_model.getRoot());
            TreePath destination_path = new TreePath(raw_path.toArray());
            raw_path = null;
            // Retrieve the destination node
            FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
            // destination_path = null;
            //FileNode new_file_node = new FileNode(new_file);

            // It suddenly occurs to me that by retrieving the destination path, we are
            // causing the potential destination node to map its children, which includes
            // the file which I am about to add. Hence I was ending up with two copies.
            ///atherer.println("Ready to insert new FileNode.");
            Gatherer.println("Model: " + tree_model);
            Gatherer.println("Destination path: " + destination_path);
            destination_node.unmap();
            ///atherer.println("Destination node: " + destination_node);
            ///atherer.println("New node: " + new_file_node);
            //SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);

            //new_file_node = null;
            destination_node = null;
            tree_model = null;
        }
        url = null;
        current_url = null;
    }

    /** Called by the WGet native code when the requested download returns
     * a status code other than 200.
     */
    public void downloadFailed() {
        ///ystem.out.println("downloadFailed("+current_url+")");
        failed_urls.add(current_url); // It's the current url that's failed.
        progress.downloadFailed();
    }

    /**
     */
    public void downloadWarning() {
        progress.downloadWarning();
    }

    /**
     * @return A String representing the currently downloading url.
     */
    /* private String getCurrent() {
       return current_url;
       } */

    /**
     * @return A String representing the initial url's host (the root node
     * of the tree that we are mirroring).
     */
    public String getHost() {
        return url.getHost();
    }

    /**
     * @return The progress bar associated with this job.
     */
    public GProgressBar getProgressBar() {
        return progress;
    }

    /** Called to discover if the user wanted this thread to run or if
     * it is paused.
     * @return An int representing the current Job state.
     */
    public int getState() {
        return state;
    }

    /** Returns the current state of the stop flag for this job.
     * @return A boolean representing whether the user has requested to
     * stop.
     */
    public boolean hasSignalledStop() {
        return (state == Job.STOPPED || state == Job.PAUSED || state == Job.COMPLETE);
    }

    public void setState(int state) {
        previous_state = this.state;
        this.state = state;
    }

    /** A convenience call.
     * @return A String representing the initial url (the root node of the mirrored tree).
     */
    public String toString() {
        return initial.toString();
    }

    /** Called by the WGet native code to signal the current progress of
     * downloading.
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
        progress.updateProgress(current, expected);
    }
}