source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 5593

Last change on this file since 5593 was 5593, checked in by mdewsnip, 21 years ago

Changed calls to the Dictionary.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Dictionary;
45import org.greenstone.gatherer.Gatherer;
46import org.greenstone.gatherer.WGet;
47import org.greenstone.gatherer.gui.GProgressBar;
48import org.greenstone.gatherer.util.GURL;
49import org.greenstone.gatherer.util.Utility;
50/**
51 * @author John Thompson, Greenstone Digital Library, University of Waikato
52 * @version 2.0
53 */
54public class Job
55 implements ActionListener {
56
57 private boolean clobber;
58 private boolean debug;
59 private boolean higher_directories;
60 private boolean no_parents;
61 private boolean other_hosts;
62 private boolean page_requisites;
63 private boolean quiet;
64
65 private GProgressBar progress;
66
67 private GURL initial = null;
68 private GURL url = null;
69
70 private TreeModel model;
71
72 private int depth;
73 private int previous_state;
74 private int state;
75
76 private String current_url;
77 private String destination;
78 private String proxy_pass;
79 private String proxy_user;
80
81 private Vector encountered_urls;
82 private Vector failed_urls;
83
84 private WGet mummy;
85
86 public static int COMPLETE = 0;
87 public static int PAUSED = 1;
88 public static int RUNNING = 2;
89 public static int STOPPED = 3;
90
91 /**
92 */
93 public Job(TreeModel model, boolean clobber, boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
94 this.model = model;
95
96 this.debug = debug;
97 this.clobber = clobber;
98 this.no_parents = no_parents;
99 this.other_hosts = other_hosts;
100 this.page_requisites = page_requisites;
101 this.quiet = quiet;
102 this.initial = new GURL(initial);
103 this.depth = depth;
104 this.destination = destination;
105 this.proxy_pass = proxy_pass;
106 this.proxy_user = proxy_user;
107 this.mummy = mummy;
108
109 progress = new GProgressBar(this, initial.toString(), simple);
110
111 encountered_urls = new Vector();
112 failed_urls = new Vector();
113
114 previous_state = STOPPED;
115 state = STOPPED;
116 }
117
118 /** Depending on which button on the progress bar was pushed,
119 * this method will affect the state of the Job and perhaps make
120 * calls to wget.class if necessary.
121 * @param event The ActionEvent fired from within the GProgressBar
122 * which we must respond to.
123 */
124 public void actionPerformed(ActionEvent event) {
125 // The action button is used to alternately start or stop the
126 // job. If the current state of the job is paused then this
127 // restart is logically equivelent to a resume.
128 if(event.getSource() == progress.action) {
129 previous_state = state;
130 if(state == RUNNING) {
131 state = PAUSED;
132 }
133 else {
134 state = RUNNING;
135 mummy.resumeThread();
136 }
137 }
138 else if (event.getSource() == progress.cancel) {
139 state = STOPPED; // Should already be stopped.
140 mummy.deleteJob(this);
141 }
142 }
143
144 /** Called by the WGet native code to inform us of a new download starting.
145 * @param url The url that is being downloaded, as a String.
146 */
147 public void addDownload(String raw_url) {
148 if(!encountered_urls.contains(raw_url)) {
149 encountered_urls.add(raw_url);
150 }
151 // Regardless create a new GURL
152 current_url = raw_url;
153 url = new GURL(raw_url);
154 progress.addDownload(raw_url);
155 }
156
157 /** Used to advise the Job of a newly parsed link. Its up to Job
158 * to decide if it already knows about this url, and if not to
159 * update its progress bar.
160 * @param url The url in question as a String.
161 * @param type Whether the link is an internal or external link.
162 * @return A boolean indicating if the url was added.
163 */
164 public boolean addLink(String raw_url, int type) {
165 ///ystem.out.println("addLink("+url+", "+type+")");
166 if(!encountered_urls.contains(raw_url)) {
167 // Add it to the urls we've seen.
168 encountered_urls.add(raw_url);
169 // Add it the to links for the current GURL.
170
171 // Add it to the progress file count.
172 progress.increaseFileCount();
173 return true;
174 }
175 // Regardless add it to the children links of the current GURL
176 initial.addLink(raw_url);
177
178 // We've seen it before. Don't count it again.
179 return false;
180 }
181
182 public void callWGet() {
183 // Build parameter string
184 String command = "wget ";
185
186 // Parse arguments into array.
187 // Always:
188 // rewrite links to be local if possible - NOOOOOO,
189 // output a debug file and debug messages,
190 // run quietly.
191 //command = command + "-k ";
192
193 if(destination != null) {
194 command = command + "-P " + destination + " ";
195 }
196
197 if(depth < 0) {
198 // Infinite recursion
199 command = command + "-r ";
200 }
201 else if (depth == 0) {
202 // Just this page.
203 }
204 else if (depth > 0) {
205 // Recursion to the specified depth.
206 command = command + "-r -l" + depth + " ";
207 }
208
209 if(!clobber || previous_state == Job.PAUSED) {
210 command = command + "-nc -c ";
211 }
212
213 // Determine if we have to use a proxy.
214 if(Gatherer.config.get("general.use_proxy", true)) {
215 String proxy_host = Gatherer.config.getString("general.proxy_host", true);
216 String proxy_port = Gatherer.config.getString("general.proxy_port", true);
217 // Find out whether the user has already authenticated themselves
218 String user_pass = null;
219 String address = proxy_host + ":" + proxy_port;
220 int count = 0;
221 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
222 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
223 count++;
224 }
225 if(count >= 3) {
226 state = STOPPED;
227 return;
228 }
229 if(user_pass.indexOf("@") != -1) {
230 // Write the use proxy command
231 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
232
233 }
234 else {
235 Gatherer.println("Unknown user/pass");
236 }
237 }
238
239 if(page_requisites) {
240 command = command + "-p ";
241 }
242
243 if(other_hosts) {
244 command = command + "-H ";
245 }
246
247 // Finally tell it the site to download.
248 command = command + initial.toString();
249
250 if(previous_state == Job.COMPLETE) {
251 progress.mirrorBegun(true, true);
252 }
253 else {
254 progress.mirrorBegun(false, true);
255 }
256
257 // Run it
258 try {
259 Gatherer.println("Cmd: " + command);
260 Runtime rt = Runtime.getRuntime();
261 Process prcs = rt.exec(command);
262 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
263 BufferedReader br = new BufferedReader(isr);
264 // Capture the standard error stream and seach for two particular occurances.
265 String line;
266 boolean ignore_for_robots = false;
267 while ((line = br.readLine()) != null) {
268 Gatherer.println(line);
269
270 // The first magic special test is to see if we've just
271 // asked for the robots.txt file. If so we ignore
272 // the next add and then the next complete/error.
273 if(line.lastIndexOf("robots.txt;") != -1) {
274 Gatherer.println("***** Requesting robot.txt");
275 ignore_for_robots = true;
276 }
277 // If line contains "=> `" display text as the
278 // currently downloading url. Unique to add download.
279 else if(line.lastIndexOf("=> `") != -1) {
280 if(!ignore_for_robots) {
281 // Add download
282 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
283 // Remove the destination guff
284 if(destination != null) {
285 new_url = new_url.substring(destination.length());
286 }
287 addDownload("http:/" + new_url);
288 }
289 }
290 // If line contains "saved [<size>]" set currently
291 // downloading url to "Download Complete".
292 else if(line.lastIndexOf(") - `") != -1) {
293 if(!ignore_for_robots) {
294 // Download complete
295 downloadComplete();
296 }
297 else {
298 ignore_for_robots = false;
299 }
300 }
301 // The already there line begins "File `..." However this
302 // is only true in english, so instead I looked and there
303 // are few (if any at all) other messages than those above
304 // and not overwriting messages that use " `" so we'll
305 // look for that. Note this method is not guarenteed to be
306 // unique like the previous two.
307 else if(line.lastIndexOf(" `") != -1) {
308 // Not Overwriting
309 Gatherer.println("Already there.");
310 String new_url =
311 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
312 // For some strange reason this won't compile
313 // src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
314 // symbol : class CAKE
315 // location: class org.greenstone.gatherer.collection.Job
316 /* ***********************************************************
317 CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
318 *********************************************************** */
319 // Remove the destination guff
320 if(destination != null) {
321 new_url = new_url.substring(destination.length());
322 }
323 addDownload("http:/" + new_url);
324 downloadWarning();
325 }
326 // Any other important message starts with the time in the form hh:mm:ss
327 else if(line.length() > 7) {
328 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
329 if(!ignore_for_robots) {
330 Gatherer.println("Error.");
331 downloadFailed();
332 }
333 else {
334 ignore_for_robots = false;
335 }
336 }
337 }
338 }
339 // Now display final message based on exit value
340 prcs.waitFor();
341 }
342 catch (Exception ioe) {
343 //message(Utility.ERROR, ioe.toString());
344 Gatherer.printStackTrace(ioe);
345 }
346 // If we've got to here and the state isn't STOPPED then the
347 // job is complete.
348 if(state == Job.RUNNING) {
349 progress.mirrorComplete();
350 previous_state = state;
351 state = Job.COMPLETE;
352 }
353 }
354
355 /** The most important part of the Job class, this method is
356 * responsible for calling the WGet native methods used to
357 * mirror the indicated url. By this stage all the variables
358 * necessary should be set and we need only build up the
359 * parameter string and make the call.
360 */
361 public void callWGetNative() {
362 Vector args = new Vector();
363
364 // Let the GProgressBar know we're starting, just in case
365 // the user hasn't told us to. If this is the second time the
366 // urls downloaded and the first attempt was successful (ie
367 // the previous job was complete), then we have the case where
368 // the user is forcing us to remirror. Reset all the values etc
369 // if this is the case then reset the variables.
370 // Note that this can cause the result line to look something
371 // like this.
372 // Downloaded 12 of 12 files (8 warnings, 0 errors).
373 // The warnings would be something like, 'File already downloaded'
374 // but the total number of files and the file successfully
375 // downloaded will be correct.
376 if(previous_state == Job.COMPLETE) {
377 progress.mirrorBegun(true, false);
378 }
379 else {
380 progress.mirrorBegun(false, false);
381 }
382
383 // Parse arguments into array.
384 args.add(Utility.BASE_DIR + "wget");
385 //args.add("-k");
386 args.add("-d");
387 args.add("-o");
388 args.add("debug.txt");
389
390 if(destination != null) {
391 args.add("-P");
392 args.add(destination);
393 }
394
395 if(depth < 0) {
396 // Infinite recursion
397 args.add("-r");
398 }
399 else if (depth == 0) {
400 // Just this page.
401 }
402 else if (depth > 0) {
403 // Recursion to the specified depth.
404 args.add("-r");
405 args.add("-l");
406 args.add("" + depth + ""); // Hacky
407 }
408
409 if(!clobber || previous_state == PAUSED) {
410 args.add("-nc");
411 args.add("-c");
412 }
413
414 if(proxy_user != null) {
415 args.add("--proxy-user=" + proxy_user);
416 args.add("--proxy-passwd=" + proxy_pass);
417 }
418
419 if(page_requisites) {
420 args.add("-p");
421 }
422
423 if(quiet) {
424 args.add("-q");
425 }
426
427 if(other_hosts) {
428 args.add("-H");
429 }
430
431 args.add(initial.toString());
432
433 Gatherer.println("Calling wget ");
434 for(Enumeration e = args.elements(); e.hasMoreElements();) {
435 Gatherer.println(e.nextElement() + " ");
436 }
437 Gatherer.println("");
438
439 // Run home to mummy.
440 int value = mummy.wget(args.size(), args.toArray(), debug);
441
442 // If we've got to here and the state isn't STOPPED then the job is complete.
443 if(state == RUNNING) {
444 progress.mirrorComplete();
445 previous_state = state;
446 state = COMPLETE;
447 }
448 }
449
450 /** Called by the WGet native code when the current download is
451 * completed. In turn all download listeners are informed.
452 */
453 public void downloadComplete() {
454 progress.downloadComplete();
455 /* @todo
456 model.add(url.getURL(), destination);
457 */
458 url = null;
459 current_url = null;
460 }
461
462 /** Called by the WGet native code when the requested download returns
463 * a status code other than 200.
464 */
465 public void downloadFailed() {
466 ///ystem.out.println("downloadFailed("+current_url+")");
467 failed_urls.add(current_url); // Its the current url thats failed.
468 progress.downloadFailed();
469 }
470
471 /**
472 */
473 public void downloadWarning() {
474 progress.downloadWarning();
475 }
476
477 /**
478 * @return A String representing the currently downloading url.
479 */
480 public String getCurrent() {
481 return current_url;
482 }
483
484 /**
485 * @return A String representing the initial urls host (root node
486 * of tree that we are mirroring).
487 */
488 public String getHost() {
489 return url.getHost();
490 }
491
492 /**
493 * @return Returns the progress bar associated with this job.
494 */
495 public GProgressBar getProgressBar() {
496 return progress;
497 }
498
499 /** Called to discover if the user wanted this thread to run or if
500 * it is paused.
501 * @return An int representing the current Job state.
502 */
503 public int getState() {
504 return state;
505 }
506
507 /** Returns the current state of the stop flag for this job.
508 * @return A boolean representing whether the user has requested to
509 * stop.
510 */
511 public boolean hasSignalledStop() {
512 if(state == Job.STOPPED || state == Job.PAUSED ||
513 state == Job.COMPLETE) {
514 return true;
515 }
516 return false;
517 }
518
519 /** A convinence call.
520 * @return A String representing the url of the initial url (root node of the mirrored tree).
521 */
522 public String toString() {
523 return initial.toString();
524 }
525
526 /** Called by the WGet native code to signal the current progress of
527 * downloading.
528 * @param current A long representing the number of bytes that have
529 * been downloaded since the last update.
530 * @param expected A long representing the total number of bytes
531 * expected for this download.
532 */
533 public void updateProgress(long current, long expected) {
534 progress.updateProgress(current, expected);
535 }
536}
Note: See TracBrowser for help on using the repository browser.