source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 5319

Last change on this file since 5319 was 5319, checked in by jmt12, 21 years ago

Rewrote to match WGet, and also debugged proxy problems. Also fixed difference between page requisite destination folders

  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1package org.greenstone.gatherer.collection;
2/**
3 *#########################################################################
4 *
5 * A component of the Gatherer application, part of the Greenstone digital
6 * library suite from the New Zealand Digital Library Project at the
7 * University of Waikato, New Zealand.
8 *
9 * <BR><BR>
10 *
11 * Author: John Thompson, Greenstone Digital Library, University of Waikato
12 *
13 * <BR><BR>
14 *
15 * Copyright (C) 1999 New Zealand Digital Library Project
16 *
17 * <BR><BR>
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation; either version 2 of the License, or
22 * (at your option) any later version.
23 *
24 * <BR><BR>
25 *
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * <BR><BR>
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with this program; if not, write to the Free Software
35 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
36 *########################################################################
37 */
38import java.awt.event.*;
39import java.io.*;
40import java.net.*;
41import java.util.*;
42import javax.swing.tree.*;
43import org.greenstone.gatherer.Gatherer;
44import org.greenstone.gatherer.WGet;
45import org.greenstone.gatherer.gui.GProgressBar;
46import org.greenstone.gatherer.util.GURL;
47import org.greenstone.gatherer.util.Utility;
48/**
49 * @author John Thompson, Greenstone Digital Library, University of Waikato
50 * @version 2.0
51 */
52public class Job
53 implements ActionListener {
54
55 private boolean clobber;
56 private boolean debug;
57 private boolean higher_directories;
58 private boolean no_parents;
59 private boolean other_hosts;
60 private boolean page_requisites;
61 private boolean quiet;
62
63 private GProgressBar progress;
64
65 private GURL initial = null;
66 private GURL url = null;
67
68 private TreeModel model;
69
70 private int depth;
71 private int previous_state;
72 private int state;
73
74 private String current_url;
75 private String destination;
76 private String proxy_pass;
77 private String proxy_user;
78
79 private Vector encountered_urls;
80 private Vector failed_urls;
81
82 private WGet mummy;
83
84 public static int COMPLETE = 0;
85 public static int PAUSED = 1;
86 public static int RUNNING = 2;
87 public static int STOPPED = 3;
88
/** Creates a job for mirroring a single url. The job starts out in the
 * STOPPED state with STOPPED also recorded as its previous state.
 * @param model Stored for later use.
 * @param clobber When false wget is asked not to overwrite existing files.
 * @param debug Forwarded to the native wget call.
 * @param no_parents Stored for later use.
 * @param other_hosts When true wget is given -H.
 * @param page_requisites When true wget is given -p.
 * @param quiet When true the native wget call is given -q.
 * @param initial The url to mirror.
 * @param depth Recursion depth; negative for unlimited, zero for just the
 * initial page.
 * @param destination Directory prefix handed to wget with -P, or null.
 * @param proxy_pass Proxy password for the native wget call.
 * @param proxy_user Proxy user name for the native wget call, or null.
 * @param mummy The WGet instance that owns this job.
 * @param simple Forwarded to the GProgressBar constructor.
 */
public Job(TreeModel model, boolean clobber, boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
    // Collaborating objects.
    this.mummy = mummy;
    this.model = model;

    // Switches that shape the wget command line.
    this.clobber = clobber;
    this.debug = debug;
    this.quiet = quiet;
    this.no_parents = no_parents;
    this.other_hosts = other_hosts;
    this.page_requisites = page_requisites;
    this.depth = depth;
    this.destination = destination;

    // Proxy credentials.
    this.proxy_user = proxy_user;
    this.proxy_pass = proxy_pass;

    // What to download, and the widget that reports on it.
    this.initial = new GURL(initial);
    this.progress = new GProgressBar(this, initial.toString(), simple);

    this.encountered_urls = new Vector();
    this.failed_urls = new Vector();

    // Nothing runs until the user (or WGet) starts the job.
    this.previous_state = STOPPED;
    this.state = STOPPED;
}
115
116 /** Depending on which button on the progress bar was pushed,
117 * this method will affect the state of the Job and perhaps make
118 * calls to wget.class if necessary.
119 * @param event The ActionEvent fired from within the GProgressBar
120 * which we must respond to.
121 */
122 public void actionPerformed(ActionEvent event) {
123 // The action button is used to alternately start or stop the
124 // job. If the current state of the job is paused then this
125 // restart is logically equivelent to a resume.
126 if(event.getSource() == progress.action) {
127 previous_state = state;
128 if(state == RUNNING) {
129 state = PAUSED;
130 }
131 else {
132 state = RUNNING;
133 mummy.resumeThread();
134 }
135 }
136 else if (event.getSource() == progress.cancel) {
137 state = STOPPED; // Should already be stopped.
138 mummy.deleteJob(this);
139 }
140 }
141
142 /** Called by the WGet native code to inform us of a new download starting.
143 * @param url The url that is being downloaded, as a String.
144 */
145 public void addDownload(String raw_url) {
146 if(!encountered_urls.contains(raw_url)) {
147 encountered_urls.add(raw_url);
148 }
149 // Regardless create a new GURL
150 current_url = raw_url;
151 url = new GURL(raw_url);
152 progress.addDownload(raw_url);
153 }
154
155 /** Used to advise the Job of a newly parsed link. Its up to Job
156 * to decide if it already knows about this url, and if not to
157 * update its progress bar.
158 * @param url The url in question as a String.
159 * @param type Whether the link is an internal or external link.
160 * @return A boolean indicating if the url was added.
161 */
162 public boolean addLink(String raw_url, int type) {
163 ///ystem.out.println("addLink("+url+", "+type+")");
164 if(!encountered_urls.contains(raw_url)) {
165 // Add it to the urls we've seen.
166 encountered_urls.add(raw_url);
167 // Add it the to links for the current GURL.
168
169 // Add it to the progress file count.
170 progress.increaseFileCount();
171 return true;
172 }
173 // Regardless add it to the children links of the current GURL
174 initial.addLink(raw_url);
175
176 // We've seen it before. Don't count it again.
177 return false;
178 }
179
180 public void callWGet() {
181 // Build parameter string
182 String command = "wget ";
183
184 // Parse arguments into array.
185 // Always:
186 // rewrite links to be local if possible - NOOOOOO,
187 // output a debug file and debug messages,
188 // run quietly.
189 //command = command + "-k ";
190
191 if(destination != null) {
192 command = command + "-P " + destination + " ";
193 }
194
195 if(depth < 0) {
196 // Infinite recursion
197 command = command + "-r ";
198 }
199 else if (depth == 0) {
200 // Just this page.
201 }
202 else if (depth > 0) {
203 // Recursion to the specified depth.
204 command = command + "-r -l" + depth + " ";
205 }
206
207 if(!clobber || previous_state == Job.PAUSED) {
208 command = command + "-nc -c ";
209 }
210
211 // Determine if we have to use a proxy.
212 if(Gatherer.config.get("general.use_proxy", true)) {
213 String proxy_host = Gatherer.config.getString("general.proxy_host", true);
214 String proxy_port = Gatherer.config.getString("general.proxy_port", true);
215 // Find out whether the user has already authenticated themselves
216 String user_pass = null;
217 String address = proxy_host + ":" + proxy_port;
218 System.err.println("Searching for authentication for: " + address);
219 int count = 0;
220 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
221 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Gatherer.dictionary.get("WGet.Prompt"), "HTTP");
222 count++;
223 }
224 if(count >= 3) {
225 state = STOPPED;
226 return;
227 }
228 System.err.println("login@pass: " + user_pass);
229 if(user_pass.indexOf("@") != -1) {
230 // Write the use proxy command
231 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
232
233 }
234 else {
235 Gatherer.println("Unknown user/pass");
236 }
237 }
238
239 if(page_requisites) {
240 command = command + "-p ";
241 }
242
243 if(other_hosts) {
244 command = command + "-H ";
245 }
246
247 // Finally tell it the site to download.
248 command = command + initial.toString();
249
250 if(previous_state == Job.COMPLETE) {
251 progress.mirrorBegun(true, true);
252 }
253 else {
254 progress.mirrorBegun(false, true);
255 }
256
257 // Run it
258 try {
259 Gatherer.println("Cmd: " + command);
260 Runtime rt = Runtime.getRuntime();
261 Process prcs = rt.exec(command);
262 InputStreamReader isr =
263 new InputStreamReader( prcs.getErrorStream() );
264 BufferedReader br = new BufferedReader( isr );
265 // Capture the standard error stream and seach for two particular
266 // occurances.
267 String line;
268 boolean ignore_for_robots = false;
269 while ((line = br.readLine()) != null) {
270 Gatherer.println(line);
271
272 // The first magic special test is to see if we've just
273 // asked for the robots.txt file. If so we ignore
274 // the next add and then the next complete/error.
275 if(line.lastIndexOf("robots.txt;") != -1) {
276 Gatherer.println("***** Requesting robot.txt");
277 ignore_for_robots = true;
278 }
279 // If line contains "=> `" display text as the
280 // currently downloading url. Unique to add download.
281 else if(line.lastIndexOf("=> `") != -1) {
282 if(!ignore_for_robots) {
283 // Add download
284 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
285 // Remove the destination guff
286 if(destination != null) {
287 new_url = new_url.substring(destination.length());
288 }
289 addDownload("http:/" + new_url);
290 }
291 }
292 // If line contains "saved [<size>]" set currently
293 // downloading url to "Download Complete".
294 else if(line.lastIndexOf(") - `") != -1) {
295 if(!ignore_for_robots) {
296 // Download complete
297 downloadComplete();
298 }
299 else {
300 ignore_for_robots = false;
301 }
302 }
303 // The already there line begins "File `..." However this
304 // is only true in english, so instead I looked and there
305 // are few (if any at all) other messages than those above
306 // and not overwriting messages that use " `" so we'll
307 // look for that. Note this method is not guarenteed to be
308 // unique like the previous two.
309 else if(line.lastIndexOf(" `") != -1) {
310 // Not Overwriting
311 Gatherer.println("Already there.");
312 String new_url =
313 line.substring(line.indexOf("`") + 1,
314 line.lastIndexOf("'"));
315 // For some strange reason this won't compile
316 // src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
317 // symbol : class CAKE
318 // location: class org.greenstone.gatherer.collection.Job
319 /* ***********************************************************
320 CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
321 *********************************************************** */
322 // Remove the destination guff
323 if(destination != null) {
324 new_url = new_url.substring(destination.length());
325 }
326 addDownload("http:/" + new_url);
327 downloadWarning();
328 }
329 // Any other important message starts with the time in the form hh:mm:ss
330 else if(line.length() > 7) {
331 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
332 if(!ignore_for_robots) {
333 Gatherer.println("Error.");
334 downloadFailed();
335 }
336 else {
337 ignore_for_robots = false;
338 }
339 }
340 }
341 }
342 // Now display final message based on exit value
343 prcs.waitFor();
344 } catch (Exception ioe) {
345 //message(Utility.ERROR, ioe.toString());
346 Gatherer.printStackTrace(ioe);
347 }
348 // If we've got to here and the state isn't STOPPED then the
349 // job is complete.
350 if(state == Job.RUNNING) {
351 progress.mirrorComplete();
352 previous_state = state;
353 state = Job.COMPLETE;
354 }
355 }
356
357 /** The most important part of the Job class, this method is
358 * responsible for calling the WGet native methods used to
359 * mirror the indicated url. By this stage all the variables
360 * necessary should be set and we need only build up the
361 * parameter string and make the call.
362 */
363 public void callWGetNative() {
364 Vector args = new Vector();
365
366 // Let the GProgressBar know we're starting, just in case
367 // the user hasn't told us to. If this is the second time the
368 // urls downloaded and the first attempt was successful (ie
369 // the previous job was complete), then we have the case where
370 // the user is forcing us to remirror. Reset all the values etc
371 // if this is the case then reset the variables.
372 // Note that this can cause the result line to look something
373 // like this.
374 // Downloaded 12 of 12 files (8 warnings, 0 errors).
375 // The warnings would be something like, 'File already downloaded'
376 // but the total number of files and the file successfully
377 // downloaded will be correct.
378 if(previous_state == Job.COMPLETE) {
379 progress.mirrorBegun(true, false);
380 }
381 else {
382 progress.mirrorBegun(false, false);
383 }
384
385 // Parse arguments into array.
386 args.add(Utility.BASE_DIR + "wget");
387 //args.add("-k");
388 args.add("-d");
389 args.add("-o");
390 args.add("debug.txt");
391
392 if(destination != null) {
393 args.add("-P");
394 args.add(destination);
395 }
396
397 if(depth < 0) {
398 // Infinite recursion
399 args.add("-r");
400 }
401 else if (depth == 0) {
402 // Just this page.
403 }
404 else if (depth > 0) {
405 // Recursion to the specified depth.
406 args.add("-r");
407 args.add("-l");
408 args.add("" + depth + ""); // Hacky
409 }
410
411 if(!clobber || previous_state == PAUSED) {
412 args.add("-nc");
413 args.add("-c");
414 }
415
416 if(proxy_user != null) {
417 args.add("--proxy-user=" + proxy_user);
418 args.add("--proxy-passwd=" + proxy_pass);
419 }
420
421 if(page_requisites) {
422 args.add("-p");
423 }
424
425 if(quiet) {
426 args.add("-q");
427 }
428
429 if(other_hosts) {
430 args.add("-H");
431 }
432
433 args.add(initial.toString());
434
435 Gatherer.println("Calling wget ");
436 for(Enumeration e = args.elements(); e.hasMoreElements();) {
437 Gatherer.println(e.nextElement() + " ");
438 }
439 Gatherer.println("");
440
441 // Run home to mummy.
442 int value = mummy.wget(args.size(), args.toArray(), debug);
443
444 // If we've got to here and the state isn't STOPPED then the job is complete.
445 if(state == RUNNING) {
446 progress.mirrorComplete();
447 previous_state = state;
448 state = COMPLETE;
449 }
450 }
451
452 /** Called by the WGet native code when the current download is
453 * completed. In turn all download listeners are informed.
454 */
455 public void downloadComplete() {
456 progress.downloadComplete();
457 /* @todo
458 model.add(url.getURL(), destination);
459 */
460 url = null;
461 current_url = null;
462 }
463
464 /** Called by the WGet native code when the requested download returns
465 * a status code other than 200.
466 */
467 public void downloadFailed() {
468 ///ystem.out.println("downloadFailed("+current_url+")");
469 failed_urls.add(current_url); // Its the current url thats failed.
470 progress.downloadFailed();
471 }
472
473 /**
474 */
475 public void downloadWarning() {
476 progress.downloadWarning();
477 }
478
479 /**
480 * @return A String representing the currently downloading url.
481 */
482 public String getCurrent() {
483 return current_url;
484 }
485
486 /**
487 * @return A String representing the initial urls host (root node
488 * of tree that we are mirroring).
489 */
490 public String getHost() {
491 return url.getHost();
492 }
493
494 /**
495 * @return Returns the progress bar associated with this job.
496 */
497 public GProgressBar getProgressBar() {
498 return progress;
499 }
500
501 /** Called to discover if the user wanted this thread to run or if
502 * it is paused.
503 * @return An int representing the current Job state.
504 */
505 public int getState() {
506 return state;
507 }
508
509 /** Returns the current state of the stop flag for this job.
510 * @return A boolean representing whether the user has requested to
511 * stop.
512 */
513 public boolean hasSignalledStop() {
514 if(state == Job.STOPPED || state == Job.PAUSED ||
515 state == Job.COMPLETE) {
516 return true;
517 }
518 return false;
519 }
520
521 /** A convinence call.
522 * @return A String representing the url of the initial url (root node of the mirrored tree).
523 */
524 public String toString() {
525 return initial.toString();
526 }
527
528 /** Called by the WGet native code to signal the current progress of
529 * downloading.
530 * @param current A long representing the number of bytes that have
531 * been downloaded since the last update.
532 * @param expected A long representing the total number of bytes
533 * expected for this download.
534 */
535 public void updateProgress(long current, long expected) {
536 progress.updateProgress(current, expected);
537 }
538}
539
540
541
542
Note: See TracBrowser for help on using the repository browser.