source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 4366

Last change on this file since 4366 was 4366, checked in by kjdon, 21 years ago

re-tabbed the code for java

  • Property svn:keywords set to Author Date Id Revision
File size: 14.2 KB
Line 
1package org.greenstone.gatherer.collection;
2/**
3 *#########################################################################
4 *
5 * A component of the Gatherer application, part of the Greenstone digital
6 * library suite from the New Zealand Digital Library Project at the
7 * University of Waikato, New Zealand.
8 *
9 * <BR><BR>
10 *
11 * Author: John Thompson, Greenstone Digital Library, University of Waikato
12 *
13 * <BR><BR>
14 *
15 * Copyright (C) 1999 New Zealand Digital Library Project
16 *
17 * <BR><BR>
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation; either version 2 of the License, or
22 * (at your option) any later version.
23 *
24 * <BR><BR>
25 *
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * <BR><BR>
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with this program; if not, write to the Free Software
35 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
36 *########################################################################
37 */
38import java.awt.event.*;
39import java.io.*;
40import java.util.*;
41import javax.swing.tree.*;
42import org.greenstone.gatherer.Gatherer;
43import org.greenstone.gatherer.WGet;
44import org.greenstone.gatherer.gui.GProgressBar;
45import org.greenstone.gatherer.util.GURL;
46import org.greenstone.gatherer.util.Utility;
47/**
48 * @author John Thompson, Greenstone Digital Library, University of Waikato
49 * @version 2.0
50 */
51public class Job
52 implements ActionListener {
53
54 private boolean clobber;
55 private boolean debug;
56 private boolean higher_directories;
57 private boolean no_parents;
58 private boolean other_hosts;
59 private boolean page_requisites;
60 private boolean quiet;
61
62 private GProgressBar progress;
63
64 private GURL initial = null;
65 private GURL url = null;
66
67 private TreeModel model;
68
69 private int depth;
70 private int previous_state;
71 private int state;
72
73 private String current_url;
74 private String destination;
75 private String proxy_pass;
76 private String proxy_user;
77
78 private Vector encountered_urls;
79 private Vector failed_urls;
80
81 private WGet mummy;
82
83 public static int COMPLETE = 0;
84 public static int PAUSED = 1;
85 public static int RUNNING = 2;
86 public static int STOPPED = 3;
87
88 /**
89 */
90 public Job(TreeModel model, boolean clobber, boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, GURL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
91 this.model = model;
92
93 this.debug = debug;
94 this.clobber = clobber;
95 this.no_parents = no_parents;
96 this.other_hosts = other_hosts;
97 this.page_requisites = page_requisites;
98 this.quiet = quiet;
99 this.initial = initial;
100 this.depth = depth;
101 this.destination = destination;
102 this.proxy_pass = proxy_pass;
103 this.proxy_user = proxy_user;
104 this.mummy = mummy;
105
106 progress = new GProgressBar(this, initial.toString(), simple);
107
108 encountered_urls = new Vector();
109 failed_urls = new Vector();
110
111 previous_state = STOPPED;
112 state = STOPPED;
113 }
114
115 /** Depending on which button on the progress bar was pushed,
116 * this method will affect the state of the Job and perhaps make
117 * calls to wget.class if necessary.
118 * @param event The ActionEvent fired from within the GProgressBar
119 * which we must respond to.
120 */
121 public void actionPerformed(ActionEvent event) {
122 // The action button is used to alternately start or stop the
123 // job. If the current state of the job is paused then this
124 // restart is logically equivelent to a resume.
125 if(event.getSource() == progress.action) {
126 previous_state = state;
127 if(state == RUNNING) {
128 state = PAUSED;
129 }
130 else {
131 state = RUNNING;
132 mummy.resumeThread();
133 }
134 }
135 else if (event.getSource() == progress.cancel) {
136 state = STOPPED; // Should already be stopped.
137 mummy.deleteJob(this);
138 }
139 }
140
141 /** Called by the WGet native code to inform us of a new download starting.
142 * @param url The url that is being downloaded, as a String.
143 */
144 public void addDownload(String raw_url) {
145 if(!encountered_urls.contains(raw_url)) {
146 encountered_urls.add(raw_url);
147 }
148 // Regardless create a new GURL
149 current_url = raw_url;
150 url = new GURL(raw_url);
151 progress.addDownload(raw_url);
152 }
153
154 /** Used to advise the Job of a newly parsed link. Its up to Job
155 * to decide if it already knows about this url, and if not to
156 * update its progress bar.
157 * @param url The url in question as a String.
158 * @param type Whether the link is an internal or external link.
159 * @return A boolean indicating if the url was added.
160 */
161 public boolean addLink(String raw_url, int type) {
162 ///ystem.out.println("addLink("+url+", "+type+")");
163 if(!encountered_urls.contains(raw_url)) {
164 // Add it to the urls we've seen.
165 encountered_urls.add(raw_url);
166 // Add it the to links for the current GURL.
167
168 // Add it to the progress file count.
169 progress.increaseFileCount();
170 return true;
171 }
172 // Regardless add it to the children links of the current GURL
173 initial.addLink(raw_url);
174
175 // We've seen it before. Don't count it again.
176 return false;
177 }
178
179 public void callWGet() {
180 // Build parameter string
181 String command = "wget ";
182
183 // Parse arguments into array.
184 // Always:
185 // rewrite links to be local if possible - NOOOOOO,
186 // output a debug file and debug messages,
187 // run quietly.
188 //command = command + "-k ";
189
190 if(destination != null) {
191 command = command + "-P " + destination + " ";
192 }
193
194 if(depth < 0) {
195 // Infinite recursion
196 command = command + "-r ";
197 }
198 else if (depth == 0) {
199 // Just this page.
200 }
201 else if (depth > 0) {
202 // Recursion to the specified depth.
203 command = command + "-r -l" + depth + " ";
204 }
205
206 if(!clobber || previous_state == Job.PAUSED) {
207 command = command + "-nc -c ";
208 }
209
210 if(proxy_user != null) {
211 command = command + "--proxy-user=" + proxy_user
212 + " --proxy-passwd=" + proxy_pass + " ";
213 }
214
215 if(page_requisites) {
216 command = command + "-p ";
217 }
218
219 if(other_hosts) {
220 command = command + "-H ";
221 }
222
223 // Finally tell it the site to download.
224 command = command + initial.toString();
225
226 if(previous_state == Job.COMPLETE) {
227 progress.mirrorBegun(true);
228 }
229 else {
230 progress.mirrorBegun(false);
231 }
232
233 // Run it
234 try {
235 Gatherer.println("Cmd: " + command);
236 Runtime rt = Runtime.getRuntime();
237 Process prcs = rt.exec(command);
238 InputStreamReader isr =
239 new InputStreamReader( prcs.getErrorStream() );
240 BufferedReader br = new BufferedReader( isr );
241 // Capture the standard error stream and seach for two particular
242 // occurances.
243 String line;
244 boolean ignore_for_robots = false;
245 while ((line = br.readLine()) != null) {
246 Gatherer.println(line);
247
248 // The first magic special test is to see if we've just
249 // asked for the robots.txt file. If so we ignore
250 // the next add and then the next complete/error.
251 if(line.lastIndexOf("robots.txt;") != -1) {
252 Gatherer.println("***** Requesting robot.txt");
253 ignore_for_robots = true;
254 }
255 // If line contains "=> `" display text as the
256 // currently downloading url. Unique to add download.
257 else if(line.lastIndexOf("=> `") != -1) {
258 if(!ignore_for_robots) {
259 // Add download
260 String new_url =
261 line.substring(line.indexOf("`") + 1,
262 line.lastIndexOf("'"));
263 // Remove the destination guff
264 if(destination != null) {
265 new_url = new_url.substring(destination.length());
266 }
267 addDownload("http:/" + new_url);
268 }
269 }
270 // If line contains "saved [<size>]" set currently
271 // downloading url to "Download Complete".
272 else if(line.lastIndexOf(") - `") != -1) {
273 if(!ignore_for_robots) {
274 // Download complete
275 downloadComplete();
276 }
277 else {
278 ignore_for_robots = false;
279 }
280 }
281 // The already there line begins "File `..." However this
282 // is only true in english, so instead I looked and there
283 // are few (if any at all) other messages than those above
284 // and not overwriting messages that use " `" so we'll
285 // look for that. Note this method is not guarenteed to be
286 // unique like the previous two.
287 else if(line.lastIndexOf(" `") != -1) {
288 // Not Overwriting
289 Gatherer.println("Already there.");
290 String new_url =
291 line.substring(line.indexOf("`") + 1,
292 line.lastIndexOf("'"));
293 // Remove the destination guff
294 if(destination != null) {
295 new_url = new_url.substring(destination.length());
296 }
297 addDownload("http:/" + new_url);
298 downloadWarning();
299 }
300 // Any other important message starts with the time in the form hh:mm:ss
301 else if(line.length() > 7) {
302 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
303 if(!ignore_for_robots) {
304 Gatherer.println("Error.");
305 downloadFailed();
306 }
307 else {
308 ignore_for_robots = false;
309 }
310 }
311 }
312 }
313 // Now display final message based on exit value
314 prcs.waitFor();
315 } catch (Exception ioe) {
316 //message(Utility.ERROR, ioe.toString());
317 Gatherer.printStackTrace(ioe);
318 }
319 // If we've got to here and the state isn't STOPPED then the
320 // job is complete.
321 if(state == Job.RUNNING) {
322 progress.mirrorComplete();
323 previous_state = state;
324 state = Job.COMPLETE;
325 }
326 }
327
328 /** The most important part of the Job class, this method is
329 * responsible for calling the WGet native methods used to
330 * mirror the indicated url. By this stage all the variables
331 * necessary should be set and we need only build up the
332 * parameter string and make the call.
333 */
334 public void callWGetNative() {
335 Vector args = new Vector();
336
337 // Let the GProgressBar know we're starting, just in case
338 // the user hasn't told us to. If this is the second time the
339 // urls downloaded and the first attempt was successful (ie
340 // the previous job was complete), then we have the case where
341 // the user is forcing us to remirror. Reset all the values etc
342 // if this is the case then reset the variables.
343 // Note that this can cause the result line to look something
344 // like this.
345 // Downloaded 12 of 12 files (8 warnings, 0 errors).
346 // The warnings would be something like, 'File already downloaded'
347 // but the total number of files and the file successfully
348 // downloaded will be correct.
349 if(previous_state == Job.COMPLETE) {
350 progress.mirrorBegun(true);
351 }
352 else {
353 progress.mirrorBegun(false);
354 }
355
356 // Parse arguments into array.
357 args.add(Utility.BASE_DIR + "wget");
358 //args.add("-k");
359 args.add("-d");
360 args.add("-o");
361 args.add("debug.txt");
362
363 if(destination != null) {
364 args.add("-P");
365 args.add(destination);
366 }
367
368 if(depth < 0) {
369 // Infinite recursion
370 args.add("-r");
371 }
372 else if (depth == 0) {
373 // Just this page.
374 }
375 else if (depth > 0) {
376 // Recursion to the specified depth.
377 args.add("-r");
378 args.add("-l");
379 args.add("" + depth + ""); // Hacky
380 }
381
382 if(!clobber || previous_state == PAUSED) {
383 args.add("-nc");
384 args.add("-c");
385 }
386
387 if(proxy_user != null) {
388 args.add("--proxy-user=" + proxy_user);
389 args.add("--proxy-passwd=" + proxy_pass);
390 }
391
392 if(page_requisites) {
393 args.add("-p");
394 }
395
396 if(quiet) {
397 args.add("-q");
398 }
399
400 if(other_hosts) {
401 args.add("-H");
402 }
403
404 args.add(initial.toString());
405
406 Gatherer.println("Calling wget ");
407 for(Enumeration e = args.elements(); e.hasMoreElements();) {
408 Gatherer.println(e.nextElement() + " ");
409 }
410 Gatherer.println("");
411
412 // Run home to mummy.
413 int value = mummy.wget(args.size(), args.toArray(), debug);
414
415 // If we've got to here and the state isn't STOPPED then the job is complete.
416 if(state == RUNNING) {
417 progress.mirrorComplete();
418 previous_state = state;
419 state = COMPLETE;
420 }
421 }
422
423 /** Called by the WGet native code when the current download is
424 * completed. In turn all download listeners are informed.
425 */
426 public void downloadComplete() {
427 progress.downloadComplete();
428 /* @todo
429 model.add(url.getURL(), destination);
430 */
431 url = null;
432 current_url = null;
433 }
434
435 /** Called by the WGet native code when the requested download returns
436 * a status code other than 200.
437 */
438 public void downloadFailed() {
439 ///ystem.out.println("downloadFailed("+current_url+")");
440 failed_urls.add(current_url); // Its the current url thats failed.
441 progress.downloadFailed();
442 }
443
444 /**
445 */
446 public void downloadWarning() {
447 progress.downloadWarning();
448 }
449
450 /**
451 * @return A String representing the currently downloading url.
452 */
453 public String getCurrent() {
454 return current_url;
455 }
456
457 /**
458 * @return A String representing the initial urls host (root node
459 * of tree that we are mirroring).
460 */
461 public String getHost() {
462 return url.getHost();
463 }
464
465 /**
466 * @return Returns the progress bar associated with this job.
467 */
468 public GProgressBar getProgressBar() {
469 return progress;
470 }
471
472 /** Called to discover if the user wanted this thread to run or if
473 * it is paused.
474 * @return An int representing the current Job state.
475 */
476 public int getState() {
477 return state;
478 }
479
480 /** Returns the current state of the stop flag for this job.
481 * @return A boolean representing whether the user has requested to
482 * stop.
483 */
484 public boolean hasSignalledStop() {
485 if(state == Job.STOPPED || state == Job.PAUSED ||
486 state == Job.COMPLETE) {
487 return true;
488 }
489 return false;
490 }
491
492 /** A convinence call.
493 * @return A String representing the url of the initial url (root node of the mirrored tree).
494 */
495 public String toString() {
496 return initial.toString();
497 }
498
499 /** Called by the WGet native code to signal the current progress of
500 * downloading.
501 * @param current A long representing the number of bytes that have
502 * been downloaded since the last update.
503 * @param expected A long representing the total number of bytes
504 * expected for this download.
505 */
506 public void updateProgress(long current, long expected) {
507 progress.updateProgress(current, expected);
508 }
509}
510
511
512
513
Note: See TracBrowser for help on using the repository browser.