source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 6590

Last change on this file since 6590 was 6590, checked in by jmt12, 20 years ago

Started effecting the changes requested by Ian and David. So far I've removed the complex arguments, and have ensured that the path is correct for downloads without page requisites. I've also tried to get the workspace tree to update properly, and it is much closer than before but it is now temporarily displaying the same node twice.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * <BR><BR>
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * <BR><BR>
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * <BR><BR>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * <BR><BR>
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * <BR><BR>
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.collection;

import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.tree.*;
import org.greenstone.gatherer.Dictionary;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.WGet;
import org.greenstone.gatherer.file.FileNode;
import org.greenstone.gatherer.file.FileSystemModel;
import org.greenstone.gatherer.file.WorkspaceTreeModel;
import org.greenstone.gatherer.gui.GProgressBar;
import org.greenstone.gatherer.util.GURL;
import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
import org.greenstone.gatherer.util.Utility;

/**
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.0
 */
public class Job
    implements ActionListener {

    private boolean debug;
    private boolean higher_directories;
    private boolean no_parents;
    private boolean other_hosts;
    private boolean page_requisites;
    private boolean quiet;

    private GProgressBar progress;

    private GURL initial = null;
    private GURL url = null;

    // private TreeModel model;

    private int depth;
    private int previous_state;
    private int state;

    private String current_url;
    private String destination;
    private String proxy_pass;
    private String proxy_user;

    private Vector encountered_urls;
    private Vector failed_urls;

    private WGet mummy;

    public static final int COMPLETE = 0;
    public static final int PAUSED = 1;
    public static final int RUNNING = 2;
    public static final int STOPPED = 3;

    /** Construct a new download job with the given wget options.
     * @param initial The initial URL to mirror.
     * @param depth The maximum recursion depth, where a negative value means unlimited.
     * @param destination The directory the mirrored files are written to.
     * @param mummy The WGet instance that owns and schedules this job.
     */
    public Job(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
        // this.model = model;

        this.debug = debug;
        this.no_parents = no_parents;
        this.other_hosts = other_hosts;
        this.page_requisites = page_requisites;
        this.quiet = quiet;
        this.initial = new GURL(initial);
        this.depth = depth;
        this.destination = destination;
        this.proxy_pass = proxy_pass;
        this.proxy_user = proxy_user;
        this.mummy = mummy;

        progress = new GProgressBar(this, initial.toString(), simple);

        encountered_urls = new Vector();
        failed_urls = new Vector();

        previous_state = STOPPED;
        state = STOPPED;
    }
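
    // Illustrative sketch only (not part of the original source): a caller might create
    // and start a job along these lines, where every argument value and the
    // wget_controller variable are hypothetical.
    //
    //   Job job = new Job(false, true, false, true, false,
    //                     new URL("http://www.example.org/"), 2,
    //                     download_cache_path, null, null, wget_controller, true);
    //   job.callWGet();
    //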

    /** Depending on which button on the progress bar was pushed,
     * this method alters the state of the Job and, if necessary,
     * makes the appropriate calls to WGet.
     * @param event The ActionEvent fired from within the GProgressBar
     * which we must respond to.
     */
    public void actionPerformed(ActionEvent event) {
        // The action button is used to alternately start or stop the
        // job. If the current state of the job is paused then this
        // restart is logically equivalent to a resume.
        if(event.getSource() == progress.action) {
            previous_state = state;
            if(state == RUNNING) {
                state = PAUSED;
            }
            else {
                state = RUNNING;
                mummy.resumeThread();
            }
        }
        else if (event.getSource() == progress.cancel) {
            state = STOPPED; // Should already be stopped.
            mummy.deleteJob(this);
        }
    }
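
    // Editorial summary (derived from the handler above, not original source): pressing
    // the action button toggles RUNNING <-> PAUSED, resuming the owning WGet thread when
    // restarting from any non-RUNNING state (including COMPLETE, which forces a remirror),
    // while the cancel button moves the job to STOPPED and hands it to WGet.deleteJob().
    // The native download loop presumably polls hasSignalledStop(), which reports true
    // for STOPPED, PAUSED and COMPLETE.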

    /** Called by the WGet native code to inform us of a new download starting.
     * @param raw_url The url that is being downloaded, as a String.
     */
    public void addDownload(String raw_url) {
        if(!encountered_urls.contains(raw_url)) {
            encountered_urls.add(raw_url);
        }
        // Regardless, create a new GURL
        current_url = raw_url;
        url = new GURL(raw_url);
        progress.addDownload(raw_url);
    }

    /** Used to advise the Job of a newly parsed link. It is up to the Job
     * to decide if it already knows about this url, and if not to
     * update its progress bar.
     * @param raw_url The url in question as a String.
     * @param type Whether the link is an internal or external link.
     * @return A boolean indicating if the url was added.
     */
    public boolean addLink(String raw_url, int type) {
        ///ystem.out.println("addLink("+url+", "+type+")");
        if(!encountered_urls.contains(raw_url)) {
            // Add it to the urls we've seen.
            encountered_urls.add(raw_url);
            // Add it to the links for the current GURL.

            // Add it to the progress file count.
            progress.increaseFileCount();
            return true;
        }
        // Regardless, add it to the children links of the current GURL
        initial.addLink(raw_url);

        // We've seen it before. Don't count it again.
        return false;
    }

    public void callWGet() {
        // Build parameter string. Note that we never clobber, and we continue if possible
        String command = Gatherer.config.getWGetPath() + " -nc -c ";

        // Add the destination parameter
        if(destination != null) {
            command = command + "-P " + destination + " ";
        }

        if(depth < 0) {
            // Infinite recursion
            command = command + "-r ";
        }
        else if (depth == 0) {
            // Just this page.
        }
        else if (depth > 0) {
            // Recursion to the specified depth.
            command = command + "-r -l" + depth + " ";
        }
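
        // For example (values are illustrative only): depth = -1 adds "-r" for unbounded
        // recursion, depth = 0 adds no recursion flags so only the initial page is fetched,
        // and depth = 3 adds "-r -l3" to stop three links away from the start page.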

        // Determine if we have to use a proxy.
        if(Gatherer.config.get("general.use_proxy", true)) {
            String proxy_host = Gatherer.config.getString("general.proxy_host", true);
            String proxy_port = Gatherer.config.getString("general.proxy_port", true);
            // Find out whether the user has already authenticated themselves
            String user_pass = null;
            String address = proxy_host + ":" + proxy_port;
            int count = 0;
            while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
                Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
                count++;
            }
            if(count >= 3) {
                state = STOPPED;
                return;
            }
            if(user_pass.indexOf("@") != -1) {
                // Write the use proxy command
                command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";

            }
            else {
                Gatherer.println("Unknown user/pass");
            }
        }
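
        // Illustrative example (hypothetical values): the cached authentication entry is
        // split on "@", so a stored value of "alice@s3cret" for a proxy at
        // cache.example.org:8080 would append
        //
        //   -e httpproxy=cache.example.org:8080/ --proxy-user=alice --proxy-passwd=s3cret -Y on
        //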

        // The user can either choose to mirror all of the page requisites...
        if(page_requisites) {
            command = command + "-p ";
        }
        // or not, in which case no additional flags are added here.
        else {
        }

        if(other_hosts) {
            command = command + "-H ";
        }

        // Finally tell it the site to download.
        command = command + initial.toString();
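
        // Putting it together, a hypothetical job with destination "/home/user/gli/cache/",
        // depth 2, page requisites on, no proxy and an initial url of http://www.example.org/
        // would produce a command roughly like (option order as built above):
        //
        //   wget -nc -c -P /home/user/gli/cache/ -r -l2 -p http://www.example.org/
        //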

        if(previous_state == Job.COMPLETE) {
            progress.mirrorBegun(true, true);
        }
        else {
            progress.mirrorBegun(false, true);
        }

        // Run it
        try {
            Gatherer.println("Cmd: " + command);
            Runtime rt = Runtime.getRuntime();
            Process prcs = rt.exec(command);
            InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
            BufferedReader br = new BufferedReader(isr);
            // Capture the standard error stream and search for several particular occurrences.
            String line;
            boolean ignore_for_robots = false;
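
            // The parsing below keys on fragments of wget's progress messages written to
            // stderr. As a rough illustration only (exact wording varies with the wget
            // version and locale), the interesting lines look something like:
            //
            //   Loading robots.txt; please ignore errors.
            //              => `/home/user/gli/cache/www.example.org/page.html'
            //   12:34:57 (15.2 KB/s) - `/home/user/gli/cache/www.example.org/page.html' saved
            //   File `/home/user/gli/cache/www.example.org/index.html' already there, will not retrieve.
            //
            // The robots.txt line sets ignore_for_robots so the next add and complete/error
            // are skipped, "=> `" marks a download starting, "/s) - `" marks one completing,
            // and a bare " `" is treated as an 'already downloaded' warning. The path inside
            // the backquotes has the destination prefix stripped and "http:/" prepended to
            // recover the original url.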
            while ((line = br.readLine()) != null) {
                Gatherer.println(line);

                // The first magic special test is to see if we've just
                // asked for the robots.txt file. If so we ignore
                // the next add and then the next complete/error.
                if(line.lastIndexOf("robots.txt;") != -1) {
                    Gatherer.println("***** Requesting robots.txt");
                    ignore_for_robots = true;
                }
                // If line contains "=> `" display text as the
                // currently downloading url. Unique to add download.
                else if(line.lastIndexOf("=> `") != -1) {
                    if(!ignore_for_robots) {
                        // Add download
                        String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                        // Remove the destination guff
                        if(destination != null) {
                            new_url = new_url.substring(destination.length());
                        }
                        addDownload("http:/" + new_url);
                    }
                }
                // If line contains "/s) - `" set currently
                // downloading url to "Download Complete".
                else if(line.lastIndexOf("/s) - `") != -1) {
                    String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    if(!ignore_for_robots) {
                        System.err.println("Not ignore for robots");
                        // Download complete
                        downloadComplete(current_file_downloading);
                    }
                    else {
                        System.err.println("Ignore for robots");
                        ignore_for_robots = false;
                    }
                }
                // The 'already there' line begins "File `...". However that
                // is only true in English, and there are few (if any) other
                // messages apart from those above and the not-overwriting
                // messages that contain " `", so we look for that instead.
                // Note that, unlike the previous two tests, this one is not
                // guaranteed to be unique.
                else if(line.lastIndexOf(" `") != -1) {
                    // Not Overwriting
                    Gatherer.println("Already there.");
                    String new_url =
                        line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
                    // For some strange reason this won't compile
                    // src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
                    // symbol  : class CAKE
                    // location: class org.greenstone.gatherer.collection.Job
                    /* ***********************************************************
                       CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
                       *********************************************************** */
                    // Remove the destination guff
                    if(destination != null) {
                        new_url = new_url.substring(destination.length());
                    }
                    addDownload("http:/" + new_url);
                    downloadWarning();
                }
                // Any other important message starts with the time in the form hh:mm:ss
                else if(line.length() > 7) {
                    if(line.charAt(2) == ':' && line.charAt(5) == ':') {
                        if(!ignore_for_robots) {
                            Gatherer.println("Error.");
                            downloadFailed();
                        }
                        else {
                            ignore_for_robots = false;
                        }
                    }
                }
            }
            // Now display final message based on exit value
            prcs.waitFor();
        }
        catch (Exception ioe) {
            //message(Utility.ERROR, ioe.toString());
            Gatherer.printStackTrace(ioe);
        }
        // If we have got this far and the state is still RUNNING then
        // the job is complete.
        if(state == Job.RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = Job.COMPLETE;
        }
    }

    /** The most important part of the Job class, this method is
     * responsible for calling the WGet native methods used to
     * mirror the indicated url. By this stage all the variables
     * necessary should be set and we need only build up the
     * parameter string and make the call.
     */
    public void callWGetNative() {
        Vector args = new Vector();

        // Let the GProgressBar know we're starting, just in case
        // the user hasn't told us to. If this is the second time the
        // urls have been downloaded and the first attempt was successful
        // (ie the previous job was complete), then the user is forcing
        // us to remirror, so reset the variables.
        // Note that this can cause the result line to look something
        // like this:
        // Downloaded 12 of 12 files (8 warnings, 0 errors).
        // The warnings would be something like 'File already downloaded',
        // but the total number of files and the number of files
        // successfully downloaded will be correct.
        if(previous_state == Job.COMPLETE) {
            progress.mirrorBegun(true, false);
        }
        else {
            progress.mirrorBegun(false, false);
        }

        // Parse arguments into array.
        args.add(Utility.BASE_DIR + "wget");
        //args.add("-k");
        args.add("-d");
        args.add("-o");
        args.add("debug.txt");

        if(destination != null) {
            args.add("-P");
            args.add(destination);
        }

        if(depth < 0) {
            // Infinite recursion
            args.add("-r");
        }
        else if (depth == 0) {
            // Just this page.
        }
        else if (depth > 0) {
            // Recursion to the specified depth.
            args.add("-r");
            args.add("-l");
            args.add(String.valueOf(depth));
        }

        if(previous_state == PAUSED) {
            args.add("-nc");
            args.add("-c");
        }

        if(proxy_user != null) {
            args.add("--proxy-user=" + proxy_user);
            args.add("--proxy-passwd=" + proxy_pass);
        }

        if(page_requisites) {
            args.add("-p");
        }

        if(quiet) {
            args.add("-q");
        }

        if(other_hosts) {
            args.add("-H");
        }

        args.add(initial.toString());
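
        // As a rough illustration only (hypothetical values): for a job with destination
        // "/home/user/gli/cache/", depth 1, page requisites on and no proxy, the argument
        // vector passed to the native wget would resemble
        //
        //   [<BASE_DIR>/wget, -d, -o, debug.txt, -P, /home/user/gli/cache/, -r, -l, 1, -p, http://www.example.org/]
        //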

        Gatherer.println("Calling wget ");
        for(Enumeration e = args.elements(); e.hasMoreElements();) {
            Gatherer.println(e.nextElement() + " ");
        }
        Gatherer.println("");

        // Run home to mummy.
        int value = mummy.wget(args.size(), args.toArray(), debug);

        // If we have got this far and the state is still RUNNING then the job is complete.
        if(state == RUNNING) {
            progress.mirrorComplete();
            previous_state = state;
            state = COMPLETE;
        }
    }

    /** Called by the WGet native code when the current download is
     * completed. In turn all download listeners are informed.
     */
    public void downloadComplete() {
        progress.downloadComplete();
        url = null;
        current_url = null;
    }

    public void downloadComplete(String current_file_downloading) {
        progress.downloadComplete();
        ///ystem.err.println("Current File: " + current_file_downloading);
        //WorkspaceTreeModel.refreshWebCacheMappings();
        if(Gatherer.g_man.collection_pane.workspace_tree != null) {
            FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.collection_pane.workspace_tree.getModel();
            File new_file = new File(current_file_downloading);
            File parent_file = new_file.getParentFile();
            String download_cache = Utility.getCacheDir().getAbsolutePath();
            ArrayList raw_path = new ArrayList();
            while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
                raw_path.add(0, parent_file.getName());
                parent_file = parent_file.getParentFile();
            }
            download_cache = null;
            // Add download cache name
            /** @todo - add to dictionary */
            raw_path.add(0, "Mirror Cache");
            // And the root node
            raw_path.add(0, tree_model.getRoot());
            TreePath destination_path = new TreePath(raw_path.toArray());
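            // Worked example (paths are hypothetical): if the download cache is
            // /home/user/gli/cache and the completed file is
            // /home/user/gli/cache/www.example.org/images/logo.gif, the while loop above
            // collects ["www.example.org", "images"]; after prepending "Mirror Cache" and
            // the workspace root, the destination path becomes
            //   [<root>, "Mirror Cache", "www.example.org", "images"]
            // which is the folder the new FileNode is inserted under.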
            raw_path = null;
            // Retrieve the destination node
            FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
            // destination_path = null;
            FileNode new_file_node = new FileNode(new_file);
            SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);
            System.err.println("Ready to insert new FileNode.");
            System.err.println("Model: " + tree_model);
            System.err.println("Destination path: " + destination_path);
            System.err.println("Destination node: " + destination_node);
            System.err.println("New node: " + new_file_node);

            new_file_node = null;
            destination_node = null;
            tree_model = null;
        }
        url = null;
        current_url = null;
    }

    /** Called by the WGet native code when the requested download returns
     * a status code other than 200.
     */
    public void downloadFailed() {
        ///ystem.out.println("downloadFailed("+current_url+")");
        failed_urls.add(current_url); // It's the current url that's failed.
        progress.downloadFailed();
    }

    /** Called when a download produces a warning (for example, the file was
     * already on disk); the warning is simply passed on to the progress bar.
     */
    public void downloadWarning() {
        progress.downloadWarning();
    }

    /**
     * @return A String representing the currently downloading url.
     */
    /* private String getCurrent() {
       return current_url;
       } */

    /**
     * @return A String representing the initial url's host (the root node
     * of the tree that we are mirroring).
     */
    public String getHost() {
        return url.getHost();
    }

    /**
     * @return Returns the progress bar associated with this job.
     */
    public GProgressBar getProgressBar() {
        return progress;
    }

    /** Called to discover if the user wanted this thread to run or if
     * it is paused.
     * @return An int representing the current Job state.
     */
    public int getState() {
        return state;
    }

    /** Returns the current state of the stop flag for this job.
     * @return A boolean representing whether the user has requested to
     * stop.
     */
    public boolean hasSignalledStop() {
        if(state == Job.STOPPED || state == Job.PAUSED ||
           state == Job.COMPLETE) {
            return true;
        }
        return false;
    }

    /** A convenience call.
     * @return A String representation of the initial url (the root node of the mirrored tree).
     */
    public String toString() {
        return initial.toString();
    }

    /** Called by the WGet native code to signal the current progress of
     * downloading.
     * @param current A long representing the number of bytes that have
     * been downloaded since the last update.
     * @param expected A long representing the total number of bytes
     * expected for this download.
     */
    public void updateProgress(long current, long expected) {
        progress.updateProgress(current, expected);
    }
}