source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 6056

Last change on this file since 6056 was 5847, checked in by mdewsnip, 21 years ago

A much improved workspace tree that only refreshes when it really needs to (and only refreshes what it really needs to). This should prevent the five second plus refreshes on slow machines.

I'm planning to tidy up the collection tree in a similar way, when I get time.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Dictionary;
45import org.greenstone.gatherer.Gatherer;
46import org.greenstone.gatherer.WGet;
47import org.greenstone.gatherer.gui.GProgressBar;
48import org.greenstone.gatherer.util.GURL;
49import org.greenstone.gatherer.util.Utility;
50/**
51 * @author John Thompson, Greenstone Digital Library, University of Waikato
52 * @version 2.0
53 */
54public class Job
55 implements ActionListener {
56
57 private boolean clobber;
58 private boolean debug;
59 private boolean higher_directories;
60 private boolean no_parents;
61 private boolean other_hosts;
62 private boolean page_requisites;
63 private boolean quiet;
64
65 private GProgressBar progress;
66
67 private GURL initial = null;
68 private GURL url = null;
69
70 // private TreeModel model;
71
72 private int depth;
73 private int previous_state;
74 private int state;
75
76 private String current_url;
77 private String destination;
78 private String proxy_pass;
79 private String proxy_user;
80
81 private Vector encountered_urls;
82 private Vector failed_urls;
83
84 private WGet mummy;
85
86 public static int COMPLETE = 0;
87 public static int PAUSED = 1;
88 public static int RUNNING = 2;
89 public static int STOPPED = 3;
90
/** Creates a new job, initially in the STOPPED state, for mirroring one url.
 * @param clobber When false wget is told not to overwrite existing files.
 * @param debug Handed on to the native wget call.
 * @param no_parents Stored, but not otherwise used by this class.
 * @param other_hosts Whether wget may span hosts (-H).
 * @param page_requisites Whether wget should fetch page requisites (-p).
 * @param quiet Whether the native wget call should be quiet (-q).
 * @param initial The URL to start mirroring from.
 * @param depth Negative for unlimited recursion, zero for just the one
 * page, otherwise the number of levels to recurse.
 * @param destination The directory prefix given to wget, or null.
 * @param proxy_pass The proxy password used by the native call.
 * @param proxy_user The proxy user used by the native call, or null.
 * @param mummy The WGet object this job reports back to.
 * @param simple Passed on to the GProgressBar constructor.
 */
public Job(/* TreeModel model, */ boolean clobber, boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
    // this.model = model;

    // Where to download from, where to put it, and who to report to.
    this.mummy = mummy;
    this.destination = destination;
    this.depth = depth;
    this.initial = new GURL(initial);

    // The wget option flags.
    this.clobber = clobber;
    this.debug = debug;
    this.quiet = quiet;
    this.no_parents = no_parents;
    this.other_hosts = other_hosts;
    this.page_requisites = page_requisites;

    // Proxy credentials.
    this.proxy_user = proxy_user;
    this.proxy_pass = proxy_pass;

    // The progress bar is labelled with the URL parameter (not the GURL field).
    this.progress = new GProgressBar(this, initial.toString(), simple);

    this.encountered_urls = new Vector();
    this.failed_urls = new Vector();

    // Nothing runs until the user presses the action button.
    this.previous_state = STOPPED;
    this.state = STOPPED;
}
117
118 /** Depending on which button on the progress bar was pushed,
119 * this method will affect the state of the Job and perhaps make
120 * calls to wget.class if necessary.
121 * @param event The ActionEvent fired from within the GProgressBar
122 * which we must respond to.
123 */
124 public void actionPerformed(ActionEvent event) {
125 // The action button is used to alternately start or stop the
126 // job. If the current state of the job is paused then this
127 // restart is logically equivelent to a resume.
128 if(event.getSource() == progress.action) {
129 previous_state = state;
130 if(state == RUNNING) {
131 state = PAUSED;
132 }
133 else {
134 state = RUNNING;
135 mummy.resumeThread();
136 }
137 }
138 else if (event.getSource() == progress.cancel) {
139 state = STOPPED; // Should already be stopped.
140 mummy.deleteJob(this);
141 }
142 }
143
144 /** Called by the WGet native code to inform us of a new download starting.
145 * @param url The url that is being downloaded, as a String.
146 */
147 public void addDownload(String raw_url) {
148 if(!encountered_urls.contains(raw_url)) {
149 encountered_urls.add(raw_url);
150 }
151 // Regardless create a new GURL
152 current_url = raw_url;
153 url = new GURL(raw_url);
154 progress.addDownload(raw_url);
155 }
156
157 /** Used to advise the Job of a newly parsed link. Its up to Job
158 * to decide if it already knows about this url, and if not to
159 * update its progress bar.
160 * @param url The url in question as a String.
161 * @param type Whether the link is an internal or external link.
162 * @return A boolean indicating if the url was added.
163 */
164 public boolean addLink(String raw_url, int type) {
165 ///ystem.out.println("addLink("+url+", "+type+")");
166 if(!encountered_urls.contains(raw_url)) {
167 // Add it to the urls we've seen.
168 encountered_urls.add(raw_url);
169 // Add it the to links for the current GURL.
170
171 // Add it to the progress file count.
172 progress.increaseFileCount();
173 return true;
174 }
175 // Regardless add it to the children links of the current GURL
176 initial.addLink(raw_url);
177
178 // We've seen it before. Don't count it again.
179 return false;
180 }
181
182 public void callWGet() {
183 // Build parameter string
184 String command = "wget ";
185
186 // Parse arguments into array.
187 // Always:
188 // rewrite links to be local if possible - NOOOOOO,
189 // output a debug file and debug messages,
190 // run quietly.
191 //command = command + "-k ";
192
193 if(destination != null) {
194 command = command + "-P " + destination + " ";
195 }
196
197 if(depth < 0) {
198 // Infinite recursion
199 command = command + "-r ";
200 }
201 else if (depth == 0) {
202 // Just this page.
203 }
204 else if (depth > 0) {
205 // Recursion to the specified depth.
206 command = command + "-r -l" + depth + " ";
207 }
208
209 if(!clobber || previous_state == Job.PAUSED) {
210 command = command + "-nc -c ";
211 }
212
213 // Determine if we have to use a proxy.
214 if(Gatherer.config.get("general.use_proxy", true)) {
215 String proxy_host = Gatherer.config.getString("general.proxy_host", true);
216 String proxy_port = Gatherer.config.getString("general.proxy_port", true);
217 // Find out whether the user has already authenticated themselves
218 String user_pass = null;
219 String address = proxy_host + ":" + proxy_port;
220 int count = 0;
221 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
222 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
223 count++;
224 }
225 if(count >= 3) {
226 state = STOPPED;
227 return;
228 }
229 if(user_pass.indexOf("@") != -1) {
230 // Write the use proxy command
231 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
232
233 }
234 else {
235 Gatherer.println("Unknown user/pass");
236 }
237 }
238
239 if(page_requisites) {
240 command = command + "-p ";
241 }
242
243 if(other_hosts) {
244 command = command + "-H ";
245 }
246
247 // Finally tell it the site to download.
248 command = command + initial.toString();
249
250 if(previous_state == Job.COMPLETE) {
251 progress.mirrorBegun(true, true);
252 }
253 else {
254 progress.mirrorBegun(false, true);
255 }
256
257 // Run it
258 try {
259 Gatherer.println("Cmd: " + command);
260 Runtime rt = Runtime.getRuntime();
261 Process prcs = rt.exec(command);
262 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
263 BufferedReader br = new BufferedReader(isr);
264 // Capture the standard error stream and seach for two particular occurances.
265 String line;
266 boolean ignore_for_robots = false;
267 while ((line = br.readLine()) != null) {
268 Gatherer.println(line);
269
270 // The first magic special test is to see if we've just
271 // asked for the robots.txt file. If so we ignore
272 // the next add and then the next complete/error.
273 if(line.lastIndexOf("robots.txt;") != -1) {
274 Gatherer.println("***** Requesting robot.txt");
275 ignore_for_robots = true;
276 }
277 // If line contains "=> `" display text as the
278 // currently downloading url. Unique to add download.
279 else if(line.lastIndexOf("=> `") != -1) {
280 if(!ignore_for_robots) {
281 // Add download
282 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
283 // Remove the destination guff
284 if(destination != null) {
285 new_url = new_url.substring(destination.length());
286 }
287 addDownload("http:/" + new_url);
288 }
289 }
290 // If line contains "saved [<size>]" set currently
291 // downloading url to "Download Complete".
292 else if(line.lastIndexOf(") - `") != -1) {
293 if(!ignore_for_robots) {
294 // Download complete
295 downloadComplete();
296 }
297 else {
298 ignore_for_robots = false;
299 }
300 }
301 // The already there line begins "File `..." However this
302 // is only true in english, so instead I looked and there
303 // are few (if any at all) other messages than those above
304 // and not overwriting messages that use " `" so we'll
305 // look for that. Note this method is not guarenteed to be
306 // unique like the previous two.
307 else if(line.lastIndexOf(" `") != -1) {
308 // Not Overwriting
309 Gatherer.println("Already there.");
310 String new_url =
311 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
312 // For some strange reason this won't compile
313 // src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
314 // symbol : class CAKE
315 // location: class org.greenstone.gatherer.collection.Job
316 /* ***********************************************************
317 CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
318 *********************************************************** */
319 // Remove the destination guff
320 if(destination != null) {
321 new_url = new_url.substring(destination.length());
322 }
323 addDownload("http:/" + new_url);
324 downloadWarning();
325 }
326 // Any other important message starts with the time in the form hh:mm:ss
327 else if(line.length() > 7) {
328 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
329 if(!ignore_for_robots) {
330 Gatherer.println("Error.");
331 downloadFailed();
332 }
333 else {
334 ignore_for_robots = false;
335 }
336 }
337 }
338 }
339 // Now display final message based on exit value
340 prcs.waitFor();
341 }
342 catch (Exception ioe) {
343 //message(Utility.ERROR, ioe.toString());
344 Gatherer.printStackTrace(ioe);
345 }
346 // If we've got to here and the state isn't STOPPED then the
347 // job is complete.
348 if(state == Job.RUNNING) {
349 progress.mirrorComplete();
350 previous_state = state;
351 state = Job.COMPLETE;
352 }
353 }
354
355 /** The most important part of the Job class, this method is
356 * responsible for calling the WGet native methods used to
357 * mirror the indicated url. By this stage all the variables
358 * necessary should be set and we need only build up the
359 * parameter string and make the call.
360 */
361 public void callWGetNative() {
362 Vector args = new Vector();
363
364 // Let the GProgressBar know we're starting, just in case
365 // the user hasn't told us to. If this is the second time the
366 // urls downloaded and the first attempt was successful (ie
367 // the previous job was complete), then we have the case where
368 // the user is forcing us to remirror. Reset all the values etc
369 // if this is the case then reset the variables.
370 // Note that this can cause the result line to look something
371 // like this.
372 // Downloaded 12 of 12 files (8 warnings, 0 errors).
373 // The warnings would be something like, 'File already downloaded'
374 // but the total number of files and the file successfully
375 // downloaded will be correct.
376 if(previous_state == Job.COMPLETE) {
377 progress.mirrorBegun(true, false);
378 }
379 else {
380 progress.mirrorBegun(false, false);
381 }
382
383 // Parse arguments into array.
384 args.add(Utility.BASE_DIR + "wget");
385 //args.add("-k");
386 args.add("-d");
387 args.add("-o");
388 args.add("debug.txt");
389
390 if(destination != null) {
391 args.add("-P");
392 args.add(destination);
393 }
394
395 if(depth < 0) {
396 // Infinite recursion
397 args.add("-r");
398 }
399 else if (depth == 0) {
400 // Just this page.
401 }
402 else if (depth > 0) {
403 // Recursion to the specified depth.
404 args.add("-r");
405 args.add("-l");
406 args.add("" + depth + ""); // Hacky
407 }
408
409 if(!clobber || previous_state == PAUSED) {
410 args.add("-nc");
411 args.add("-c");
412 }
413
414 if(proxy_user != null) {
415 args.add("--proxy-user=" + proxy_user);
416 args.add("--proxy-passwd=" + proxy_pass);
417 }
418
419 if(page_requisites) {
420 args.add("-p");
421 }
422
423 if(quiet) {
424 args.add("-q");
425 }
426
427 if(other_hosts) {
428 args.add("-H");
429 }
430
431 args.add(initial.toString());
432
433 Gatherer.println("Calling wget ");
434 for(Enumeration e = args.elements(); e.hasMoreElements();) {
435 Gatherer.println(e.nextElement() + " ");
436 }
437 Gatherer.println("");
438
439 // Run home to mummy.
440 int value = mummy.wget(args.size(), args.toArray(), debug);
441
442 // If we've got to here and the state isn't STOPPED then the job is complete.
443 if(state == RUNNING) {
444 progress.mirrorComplete();
445 previous_state = state;
446 state = COMPLETE;
447 }
448 }
449
450 /** Called by the WGet native code when the current download is
451 * completed. In turn all download listeners are informed.
452 */
453 public void downloadComplete() {
454 progress.downloadComplete();
455 /* @todo
456 model.add(url.getURL(), destination);
457 */
458 url = null;
459 current_url = null;
460 }
461
462 /** Called by the WGet native code when the requested download returns
463 * a status code other than 200.
464 */
465 public void downloadFailed() {
466 ///ystem.out.println("downloadFailed("+current_url+")");
467 failed_urls.add(current_url); // Its the current url thats failed.
468 progress.downloadFailed();
469 }
470
471 /**
472 */
473 public void downloadWarning() {
474 progress.downloadWarning();
475 }
476
477 /**
478 * @return A String representing the currently downloading url.
479 */
480 /* private String getCurrent() {
481 return current_url;
482 } */
483
484 /**
485 * @return A String representing the initial urls host (root node
486 * of tree that we are mirroring).
487 */
488 public String getHost() {
489 return url.getHost();
490 }
491
492 /**
493 * @return Returns the progress bar associated with this job.
494 */
495 public GProgressBar getProgressBar() {
496 return progress;
497 }
498
499 /** Called to discover if the user wanted this thread to run or if
500 * it is paused.
501 * @return An int representing the current Job state.
502 */
503 public int getState() {
504 return state;
505 }
506
507 /** Returns the current state of the stop flag for this job.
508 * @return A boolean representing whether the user has requested to
509 * stop.
510 */
511 public boolean hasSignalledStop() {
512 if(state == Job.STOPPED || state == Job.PAUSED ||
513 state == Job.COMPLETE) {
514 return true;
515 }
516 return false;
517 }
518
519 /** A convinence call.
520 * @return A String representing the url of the initial url (root node of the mirrored tree).
521 */
522 public String toString() {
523 return initial.toString();
524 }
525
526 /** Called by the WGet native code to signal the current progress of
527 * downloading.
528 * @param current A long representing the number of bytes that have
529 * been downloaded since the last update.
530 * @param expected A long representing the total number of bytes
531 * expected for this download.
532 */
533 public void updateProgress(long current, long expected) {
534 progress.updateProgress(current, expected);
535 }
536}
Note: See TracBrowser for help on using the repository browser.