source: trunk/gli/src/org/greenstone/gatherer/collection/Job.java@ 7275

Last change on this file since 7275 was 6842, checked in by mdewsnip, 20 years ago

Variable name changes were needed because of the renaming of the Gather, Enrich and Design pane Java files.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.collection;
38
39import java.awt.event.*;
40import java.io.*;
41import java.net.*;
42import java.util.*;
43import javax.swing.tree.*;
44import org.greenstone.gatherer.Dictionary;
45import org.greenstone.gatherer.Gatherer;
46import org.greenstone.gatherer.WGet;
47import org.greenstone.gatherer.file.FileNode;
48import org.greenstone.gatherer.file.FileSystemModel;
49import org.greenstone.gatherer.file.WorkspaceTreeModel;
50import org.greenstone.gatherer.gui.GProgressBar;
51import org.greenstone.gatherer.util.AppendLineOnlyFileDocument;
52import org.greenstone.gatherer.util.GURL;
53import org.greenstone.gatherer.util.SynchronizedTreeModelTools;
54import org.greenstone.gatherer.util.Utility;
55/**
56 * @author John Thompson, Greenstone Digital Library, University of Waikato
57 * @version 2.0
58 */
59public class Job
60 implements ActionListener {
61
62 private boolean debug;
63 private boolean higher_directories;
64 private boolean no_parents;
65 private boolean other_hosts;
66 private boolean page_requisites;
67 private boolean quiet;
68
69 private AppendLineOnlyFileDocument download_log;
70
71 private GProgressBar progress;
72
73 private GURL initial = null;
74 private GURL url = null;
75
76 // private TreeModel model;
77
78 private int depth;
79 private int previous_state;
80 private int state;
81
82 private String current_url;
83 private String destination;
84 private String proxy_pass;
85 private String proxy_user;
86
87 private Vector encountered_urls;
88 private Vector failed_urls;
89
90 private WGet mummy;
91
92 public static int COMPLETE = 0;
93 public static int PAUSED = 1;
94 public static int RUNNING = 2;
95 public static int STOPPED = 3;
96
97 /**
98 */
99 public Job(boolean debug, boolean no_parents, boolean other_hosts, boolean page_requisites, boolean quiet, URL initial, int depth, String destination, String proxy_pass, String proxy_user, WGet mummy, boolean simple) {
100 // this.model = model;
101
102 String log_filename = Utility.getLogDir(null) + "wget" + initial.hashCode() + ".log";
103 File log_file = new File(log_filename);
104 if(log_file.exists()) {
105 log_file.delete();
106 }
107 File parent_log_file = log_file.getParentFile();
108 parent_log_file.mkdirs();
109 parent_log_file = null;
110 log_file = null;
111
112 System.err.println("Creating the log file:" + log_filename);
113
114 this.debug = debug;
115 this.download_log = new AppendLineOnlyFileDocument(log_filename, false);
116 this.no_parents = no_parents;
117 this.other_hosts = other_hosts;
118 this.page_requisites = page_requisites;
119 this.quiet = quiet;
120 this.initial = new GURL(initial);
121 this.depth = depth;
122 this.destination = destination;
123 this.proxy_pass = proxy_pass;
124 this.proxy_user = proxy_user;
125 this.mummy = mummy;
126
127 progress = new GProgressBar(this, initial.toString(), simple);
128
129 encountered_urls = new Vector();
130 failed_urls = new Vector();
131
132 previous_state = STOPPED;
133 state = STOPPED;
134 }
135
136 /** Depending on which button on the progress bar was pushed,
137 * this method will affect the state of the Job and perhaps make
138 * calls to wget.class if necessary.
139 * @param event The ActionEvent fired from within the GProgressBar
140 * which we must respond to.
141 */
142 public void actionPerformed(ActionEvent event) {
143 // The action button is used to alternately start or stop the
144 // job. If the current state of the job is paused then this
145 // restart is logically equivelent to a resume.
146 if(event.getSource() == progress.action) {
147 previous_state = state;
148 state = RUNNING;
149 mummy.resumeThread();
150 }
151 else if (event.getSource() == progress.cancel) {
152 if(state == RUNNING) {
153 previous_state = state;
154 state = STOPPED; // Should already be stopped.
155 }
156 else {
157 mummy.deleteJob(this);
158 }
159 }
160 }
161
162 /** Called by the WGet native code to inform us of a new download starting.
163 * @param raw_url The url that is being downloaded, as a String.
164 */
165 public void addDownload(String raw_url) {
166 if(!encountered_urls.contains(raw_url)) {
167 encountered_urls.add(raw_url);
168 }
169 // Regardless create a new GURL
170 current_url = raw_url;
171 url = new GURL(raw_url);
172 progress.addDownload(raw_url);
173 }
174
175 /** Used to advise the Job of a newly parsed link. Its up to Job
176 * to decide if it already knows about this url, and if not to
177 * update its progress bar.
178 * @param raw_url The url in question as a String.
179 * @param type Whether the link is an internal or external link.
180 * @return A boolean indicating if the url was added.
181 */
182 public boolean addLink(String raw_url, int type) {
183 ///ystem.out.println("addLink("+url+", "+type+")");
184 if(!encountered_urls.contains(raw_url)) {
185 // Add it to the urls we've seen.
186 encountered_urls.add(raw_url);
187 // Add it the to links for the current GURL.
188
189 // Add it to the progress file count.
190 progress.increaseFileCount();
191 return true;
192 }
193 // Regardless add it to the children links of the current GURL
194 initial.addLink(raw_url);
195
196 // We've seen it before. Don't count it again.
197 return false;
198 }
199
200 public void callWGet() {
201 // Build parameter string. Note that we never clobber, and we continue if possible
202 String command = Gatherer.config.getWGetPath() + " -nc -c ";
203
204 // Add the destination parameter
205 if(destination != null) {
206 command = command + "-P " + destination + " ";
207 }
208
209 if(depth < 0) {
210 // Infinite recursion
211 command = command + "-r ";
212 }
213 else if (depth == 0) {
214 // Just this page.
215 }
216 else if (depth > 0) {
217 // Recursion to the specified depth.
218 command = command + "-r -l" + depth + " ";
219 }
220
221 // Determine if we have to use a proxy.
222 if(Gatherer.config.get("general.use_proxy", true)) {
223 String proxy_host = Gatherer.config.getString("general.proxy_host", true);
224 String proxy_port = Gatherer.config.getString("general.proxy_port", true);
225 // Find out whether the user has already authenticated themselves
226 String user_pass = null;
227 String address = proxy_host + ":" + proxy_port;
228 int count = 0;
229 while(count < 3 && (user_pass = (String) Gatherer.authentications.get(address)) == null) {
230 Authenticator.requestPasswordAuthentication(proxy_host, null, Integer.parseInt(proxy_port), "http://", Dictionary.get("WGet.Prompt"), "HTTP");
231 count++;
232 }
233 if(count >= 3) {
234 state = STOPPED;
235 return;
236 }
237 if(user_pass.indexOf("@") != -1) {
238 // Write the use proxy command
239 command = command + "-e httpproxy=" + proxy_host + ":" + proxy_port + "/ --proxy-user=" + user_pass.substring(0, user_pass.indexOf("@")) + " --proxy-passwd=" + user_pass.substring(user_pass.indexOf("@") + 1) + " -Y on ";
240
241 }
242 else {
243 Gatherer.println("Unknown user/pass");
244 }
245 }
246
247 // The user can either choose to mirror all of the page requisites...
248 if(page_requisites) {
249 command = command + "-p ";
250 }
251 // or not. In which case we ensure links are rewritten.
252 else {
253 command = command + "-k ";
254 }
255
256 if(other_hosts) {
257 command = command + "-H ";
258 }
259
260 // Finally tell it the site to download.
261 command = command + initial.toString();
262
263 if(previous_state == Job.COMPLETE) {
264 progress.mirrorBegun(true, true);
265 }
266 else {
267 progress.mirrorBegun(false, true);
268 }
269
270 // Run it
271 try {
272 Gatherer.println("Cmd: " + command);
273 Runtime rt = Runtime.getRuntime();
274 Process prcs = rt.exec(command);
275 InputStreamReader isr = new InputStreamReader(prcs.getErrorStream());
276 BufferedReader br = new BufferedReader(isr);
277 // Capture the standard error stream and seach for two particular occurances.
278 String line;
279 boolean ignore_for_robots = false;
280 while ((line = br.readLine()) != null && state != STOPPED) {
281 Gatherer.println(line);
282 download_log.appendLine(line);
283 // The first magic special test is to see if we've just
284 // asked for the robots.txt file. If so we ignore
285 // the next add and then the next complete/error.
286 if(line.lastIndexOf("robots.txt;") != -1) {
287 Gatherer.println("***** Requesting robot.txt");
288 ignore_for_robots = true;
289 }
290 // If line contains "=> `" display text as the
291 // currently downloading url. Unique to add download.
292 else if(line.lastIndexOf("=> `") != -1) {
293 if(!ignore_for_robots) {
294 // Add download
295 String new_url = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
296 // Remove the destination guff
297 if(destination != null) {
298 new_url = new_url.substring(destination.length());
299 }
300 addDownload("http:/" + new_url);
301 }
302 }
303 // If line contains "/s) - `" set currently
304 // downloading url to "Download Complete".
305 else if(line.lastIndexOf("/s) - `") != -1) {
306 String current_file_downloading = line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
307 if(!ignore_for_robots) {
308 Gatherer.println("Not ignore for robots");
309 // Download complete
310 downloadComplete(current_file_downloading);
311 }
312 else {
313 Gatherer.println("Ignore for robots");
314 ignore_for_robots = false;
315 }
316 }
317 // The already there line begins "File `..." However this
318 // is only true in english, so instead I looked and there
319 // are few (if any at all) other messages than those above
320 // and not overwriting messages that use " `" so we'll
321 // look for that. Note this method is not guarenteed to be
322 // unique like the previous two.
323 else if(line.lastIndexOf(" `") != -1) {
324 // Not Overwriting
325 Gatherer.println("Already there.");
326 String new_url =
327 line.substring(line.indexOf("`") + 1, line.lastIndexOf("'"));
328 // For some strange reason this won't compile
329 // src/org/greenstone/gatherer/collection/Job.java:311: cannot resolve symbol
330 // symbol : class CAKE
331 // location: class org.greenstone.gatherer.collection.Job
332 /* ***********************************************************
333 CAKE CAKE CAKE CAKE I WANT CAKE GIVE ME CAKE NOW!!
334 *********************************************************** */
335 // Remove the destination guff
336 if(destination != null) {
337 new_url = new_url.substring(destination.length());
338 }
339 addDownload("http:/" + new_url);
340 downloadWarning();
341 }
342 // Any other important message starts with the time in the form hh:mm:ss
343 else if(line.length() > 7) {
344 if(line.charAt(2) == ':' && line.charAt(5) == ':') {
345 if(!ignore_for_robots) {
346 Gatherer.println("Error.");
347 downloadFailed();
348 }
349 else {
350 ignore_for_robots = false;
351 }
352 }
353 }
354 }
355 if(state == STOPPED) {
356 isr.close();
357 prcs.destroy(); // This doesn't always work, but it's worth a try
358 }
359 else {
360 // Now display final message based on exit value
361 prcs.waitFor();
362 }
363 }
364 catch (Exception ioe) {
365 //message(Utility.ERROR, ioe.toString());
366 Gatherer.printStackTrace(ioe);
367 }
368 // If we've got to here and the state isn't STOPPED then the
369 // job is complete.
370 if(state == Job.RUNNING) {
371 progress.mirrorComplete();
372 previous_state = state;
373 state = Job.COMPLETE;
374 }
375 }
376
377 /** The most important part of the Job class, this method is
378 * responsible for calling the WGet native methods used to
379 * mirror the indicated url. By this stage all the variables
380 * necessary should be set and we need only build up the
381 * parameter string and make the call.
382 */
383 public void callWGetNative() {
384 Vector args = new Vector();
385
386 // Let the GProgressBar know we're starting, just in case
387 // the user hasn't told us to. If this is the second time the
388 // urls downloaded and the first attempt was successful (ie
389 // the previous job was complete), then we have the case where
390 // the user is forcing us to remirror. Reset all the values etc
391 // if this is the case then reset the variables.
392 // Note that this can cause the result line to look something
393 // like this.
394 // Downloaded 12 of 12 files (8 warnings, 0 errors).
395 // The warnings would be something like, 'File already downloaded'
396 // but the total number of files and the file successfully
397 // downloaded will be correct.
398 if(previous_state == Job.COMPLETE) {
399 progress.mirrorBegun(true, false);
400 }
401 else {
402 progress.mirrorBegun(false, false);
403 }
404
405 // Parse arguments into array.
406 args.add(Utility.BASE_DIR + "wget");
407 args.add("-d");
408 args.add("-o");
409 args.add("debug.txt");
410
411 if(destination != null) {
412 args.add("-P");
413 args.add(destination);
414 }
415
416 if(depth < 0) {
417 // Infinite recursion
418 args.add("-r");
419 }
420 else if (depth == 0) {
421 // Just this page.
422 }
423 else if (depth > 0) {
424 // Recursion to the specified depth.
425 args.add("-r");
426 args.add("-l");
427 args.add("" + depth + ""); // Hacky
428 }
429
430 if(previous_state == PAUSED) {
431 args.add("-nc");
432 args.add("-c");
433 }
434
435 if(proxy_user != null) {
436 args.add("--proxy-user=" + proxy_user);
437 args.add("--proxy-passwd=" + proxy_pass);
438 }
439
440 if(page_requisites) {
441 args.add("-p");
442 }
443
444 if(quiet) {
445 args.add("-q");
446 }
447
448 if(other_hosts) {
449 args.add("-H");
450 }
451
452 args.add(initial.toString());
453
454 Gatherer.println("Calling wget ");
455 for(Enumeration e = args.elements(); e.hasMoreElements();) {
456 Gatherer.println(e.nextElement() + " ");
457 }
458 Gatherer.println("");
459
460 // Run home to mummy.
461 int value = mummy.wget(args.size(), args.toArray(), debug);
462
463 // If we've got to here and the state isn't STOPPED then the job is complete.
464 if(state == RUNNING) {
465 progress.mirrorComplete();
466 previous_state = state;
467 state = COMPLETE;
468 }
469 }
470
471 /** Called by the WGet native code when the current download is
472 * completed. In turn all download listeners are informed.
473 */
474 public void downloadComplete() {
475 progress.downloadComplete();
476 url = null;
477 current_url = null;
478 }
479
    /** Called when the download of a named file has completed. Updates the
     * progress bar and, if the workspace tree is showing, unmaps the tree
     * node for the cache directory the file arrived under so that node
     * re-reads its children (and hence shows the new file) on next access.
     * @param current_file_downloading Absolute path of the file just downloaded.
     */
    public void downloadComplete(String current_file_downloading) {
        progress.downloadComplete();
        Gatherer.println("Current File: " + current_file_downloading);
        //WorkspaceTreeModel.refreshWebCacheMappings();
        if(Gatherer.g_man.gather_pane.workspace_tree != null) {
            FileSystemModel tree_model = (FileSystemModel) Gatherer.g_man.gather_pane.workspace_tree.getModel();
            File new_file = new File(current_file_downloading);
            File parent_file = new_file.getParentFile();
            String download_cache = Utility.getCacheDir().getAbsolutePath();
            // Walk upwards from the file's parent, collecting directory names
            // until the download cache root is reached, to build a tree path.
            ArrayList raw_path = new ArrayList();
            while(parent_file != null && !parent_file.getAbsolutePath().equals(download_cache)) {
                raw_path.add(0, parent_file.getName());
                parent_file = parent_file.getParentFile();
            }
            download_cache = null;
            // Add download cache name
            /** @todo - add to dictionary */
            raw_path.add(0, "Mirroring.Mirror_Cache");
            // And the root node
            raw_path.add(0, tree_model.getRoot());
            TreePath destination_path = new TreePath(raw_path.toArray());
            raw_path = null;
            // Retrieve the destination node
            FileNode destination_node = (FileNode) tree_model.getNode(destination_path);
            // destination_path = null;
            //FileNode new_file_node = new FileNode(new_file);

            // It suddenly occurs to me that by retrieving the destination path, we are causing the potential destination node to map its children which includes the file which I am about to add. Hence I was ending up with two copies.
            ///atherer.println("Ready to insert new FileNode.");
            Gatherer.println("Model: " + tree_model);
            Gatherer.println("Destination path: " + destination_path);
            // Unmapping forces the node to re-map its children on next access,
            // which picks up the newly downloaded file without inserting it twice.
            destination_node.unmap();
            ///atherer.println("Destination node: " + destination_node);
            ///atherer.println("New node: " + new_file_node);
            //SynchronizedTreeModelTools.insertNodeInto(tree_model, destination_node, new_file_node);

            //new_file_node = null;
            destination_node = null;
            tree_model = null;
        }
        url = null;
        current_url = null;
    }
523
524 /** Called by the WGet native code when the requested download returns
525 * a status code other than 200.
526 */
527 public void downloadFailed() {
528 ///ystem.out.println("downloadFailed("+current_url+")");
529 failed_urls.add(current_url); // Its the current url thats failed.
530 progress.downloadFailed();
531 }
532
533 /**
534 */
535 public void downloadWarning() {
536 progress.downloadWarning();
537 }
538
539 /**
540 * @return A String representing the currently downloading url.
541 */
542 /* private String getCurrent() {
543 return current_url;
544 } */
545
546 /**
547 * @return A String representing the initial urls host (root node
548 * of tree that we are mirroring).
549 */
550 public String getHost() {
551 return url.getHost();
552 }
553
554 public AppendLineOnlyFileDocument getLogDocument() {
555 return download_log;
556 }
557
558 /**
559 * @return Returns the progress bar associated with this job.
560 */
561 public GProgressBar getProgressBar() {
562 return progress;
563 }
564
565 /** Called to discover if the user wanted this thread to run or if
566 * it is paused.
567 * @return An int representing the current Job state.
568 */
569 public int getState() {
570 return state;
571 }
572
573 /** Returns the current state of the stop flag for this job.
574 * @return A boolean representing whether the user has requested to
575 * stop.
576 */
577 public boolean hasSignalledStop() {
578 if(state == Job.STOPPED || state == Job.PAUSED ||
579 state == Job.COMPLETE) {
580 return true;
581 }
582 return false;
583 }
584
585 public void setState(int state) {
586 previous_state = this.state;
587 this.state = state;
588 }
589
590 /** A convinence call.
591 * @return A String representing the url of the initial url (root node of the mirrored tree).
592 */
593 public String toString() {
594 return initial.toString();
595 }
596
597 /** Called by the WGet native code to signal the current progress of
598 * downloading.
599 * @param current A long representing the number of bytes that have
600 * been downloaded since the last update.
601 * @param expected A long representing the total number of bytes
602 * expected for this download.
603 */
604 public void updateProgress(long current, long expected) {
605 progress.updateProgress(current, expected);
606 }
607}
Note: See TracBrowser for help on using the repository browser.