source: main/trunk/greenstone3/src/java/org/greenstone/applet/GsdlCollageApplet/DownloadUrls.java@ 38968

Last change on this file since 38968 was 38968, checked in by anupama, 6 weeks ago

GsdlCollageApplet. 1. On stopRunning, DisplayImages may be looping in the graphics thread and in that case would throw an exception when interrupted. I now check isStopped() within the loop to exit more tidily in such a case. 2. Finally remembered to handle the case where there is no GS3 server running or no internet connection of any kind. Previously GsdlCollageApplet would display the Downloading message forever, which was noticeable when it was run as a command-line application and I forgot to start the GS3 server. Now, if downloading fails, it says so and comes to a halt instead of waiting for downloads and displaying the Downloading message forever. 3. Some useful local changes to the debugging output on an exception in CURL.

File size: 22.8 KB
Line 
1package org.greenstone.applet.GsdlCollageApplet;
2
3import java.awt.*;
4import java.io.*;
5import java.net.*;
6import java.util.*;
7
8import javax.swing.ImageIcon; //****
9
10//import org.apache.log4j.*;
11
12/**
13 * @author Katrina Edgar
14 * @author David Bainbridge
15 *
16 * Controls retrieval of images from the specified starting url. Follows appropriate
17 * links from this starting point, traversing in a tree-like state through several other
18 * pages. Filters images and links based on specified parameters. Also controls the quantity
19 * of downloading that occurs by restricting the number of downloaded images that are yet to
20 * be displayed to 10, and the total number of downloads allowed is also restricted by
21 * the applet application (to prevent downloading occuring infinitely). */
22
23public class DownloadUrls extends Thread {
24
25 // for GS3
26 String baseURL = null;
27
28 /** Refers to applet */
29 GsdlCollageApplet app_ = null;
30 /** Refers to download thread */
31 DownloadImages download_images_ = null;
32
33 /** The address from which the application should start looking for images */
34 String starting_url_ = null;
35
36 /** the root directory of Greenstone*/
37 String document_root_ = null;
38
39 /** When this thread is asked to stop running, this variable will be set to true */
40 private boolean stop_running = false;
41
42 /** When this thread is asked to stop downloading, this variable will be set to true.
43 * For now the behaviour is the same as stop_running=true on this thread,
44 * but in case it changes in the future, we have a separate variable.
45 * Also, calling stopRunning() is not the same as setting stop_running = true, so
46 * to be careful, a separate variable for stop_downloading could be safer when coding.
47 */
48 private boolean stop_downloading = false;
49
50 /** Set to true when unable to download perhaps because of no internet connection. */
51 private boolean unable_to_download = false;
52
53 /** CHRIS - Holds the contents of the collection's assoc directory */
54 // File[] assocDir_ = null;
55
56 /** Restricts links followed from the starting url to links that contain this string */
57 String href_musthave_ = null;
58 /** Restricts links followed from the starting url to links that do not contain this string.
59 * Also prevents image names from containing this string */
60 String image_mustnothave_ = null;
61 /** Ignore images whose names begin with this string */
62 String image_ignore_ = null;
63 /** Restricts the types of images included in the collage, for example jpg, gif, etc. */
64 String image_type_ = null;
65
66 /** A static delay used when attempting to download more images into a full downloading buffer */
67 final int delay_ = 3000;
68 /** The maximum number of images to have downloaded and not yet displayed */
69 final int buffer_size_ = 1;
70
71 /** Used in cases where the image maps to a url outside of it's original location.
72 * When used with Greenstone the collage images will refer to documents in the collections
73 * from which the images are sourced. When used individually, the images may be saved into
74 * a user directory and the pages they reference may be external hyperlinks. */
75 Hashtable external_links_ = null;
76
77 /** Records all urls which have already been examined */
78 Hashtable visited_url_ = null;
79 /** Determines whether there are still pages to examine and images to download */
80 boolean thread_running_ = true;
81
82 int verbosity_ = 0;
83
84 /** Records all images which have already been examined */
85 Hashtable visited_images_ = null;
86
87 MediaTracker tracker;
88
89 /** Constructor to initialise a download thread from which images are found,
90 * saves parameters into local variables for use within the class.
91 *
92 * @param app reference to the applet
93 * @param download_images class which stores the images retrieved in triplets
94 * @param starting_url the url from which the search for images should begin
95 * @param href_musthave restricts links to only those containing this string
96 * @param image_mustnothave restricts links and image names to only those that don't contain this string
97 * @param image_ignore restricts the beginning of image names
98 * @param image_type restricts the type of images included in the collage to those named */
99 public DownloadUrls(GsdlCollageApplet app,
100 DownloadImages download_images, String starting_url,
101 String href_musthave, String image_mustnothave,
102 String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
103 {
104 super("DownloadUrls");
105 app_ = app;
106 download_images_ = download_images;
107
108 starting_url_ = starting_url;
109 href_musthave_ = href_musthave;
110 image_mustnothave_ = image_mustnothave;
111 image_ignore_ = image_ignore;
112 image_type_ = image_type;
113 document_root_ = document_root;
114 verbosity_ = verbosity;
115 tracker = trk;
116
117 System.err.println("starting_url_ " + starting_url +"\n"+
118 "href_musthave_ " + href_musthave +"\n"+
119 "image_mustnothave_ " + image_mustnothave+"\n"+
120 "image_ignore_ "+ image_ignore+"\n"+
121 "image_type_ "+ image_type+"\n"+
122 "document_root_ "+ document_root_
123 );
124
125
126
127
128 }
129
130 /** Determines whether or not a url has already been examined
131 *
132 * @param url_string the url to check
133 * @return true if the url has been visited, false if not */
134 public boolean already_visited(String url_string)
135 {
136 int hash_pos = url_string.indexOf("#");
137 if (hash_pos>0)
138 {
139 // strip off #anchor reference
140 url_string = url_string.substring(0,hash_pos);
141 }
142
143 // if the url has been visited before, return true
144 if (visited_url_.containsKey(url_string))
145 {
146 if (verbosity_ > 3)
147 {
148 System.err.println("Visited " + url_string + " before!");
149 }
150 return true;
151 }
152
153 visited_url_.put(url_string,"visited");
154
155 return false;
156 }
157
158 /** Determines whether or not an images or its screenview has been visited)
159 * has already been examined
160 *
161 * @param url_string the url to check
162 * @param img_name the image to check
163 * @return true if the url has been visited, false if not */
164 public boolean image_visited(String url_string, String img_name)
165 {
166 String hash_dir = url_string.substring(0,url_string.lastIndexOf("/"));
167
168 if ( visited_images_.containsKey(hash_dir)){
169 Hashtable hashed_images = (Hashtable)visited_images_.get(hash_dir);
170
171 if (img_name.startsWith("screenview")){
172 return true;
173 }
174
175 if (hashed_images.containsKey(img_name)){
176 return true;
177 }
178
179 Enumeration enu = hashed_images.keys();
180 for(;enu.hasMoreElements();){
181 String name = (String)enu.nextElement();
182 if(name.startsWith("screenview")){
183 return true;
184 }
185 }
186
187 hashed_images.put(img_name,"visited");
188 }
189 else{
190 Hashtable hashed_images = new Hashtable();
191 hashed_images.put(img_name,"visited");
192 visited_images_.put(hash_dir,hashed_images);
193 }
194
195 return false;
196 }
197
198 // some other thread can call this method to tell this thread to stop running
199 public void stopRunning() {
200 if (verbosity_ >= 3) {
201 System.err.println("**** DownloadUrls.stopRunning() called");
202 }
203
204 stop_running = true;
205 // Interrupt this thread, even if it's not the one running
206 // Just want to make sure the DownloadURls' thread the CURL object runs in
207 // gets interrupted if it's what's currently running
208 if(!this.isInterrupted()) {
209 this.interrupt();
210 }
211 if(!Thread.currentThread().isInterrupted()) {
212 Thread.currentThread().interrupt();
213 }
214 }
215
216 public boolean isStopping() {
217 return stop_running;
218 }
219
220
221 /** Restricts the type of images that can be included in the collage
222 *
223 * @param url_string the url to check
224 * @return true if the image is of a specified type, false if not */
225 public boolean image_file_extension(String url_string)
226 {
227 // lower case comparisons
228 String url_lstring = url_string.toLowerCase();
229
230
231 // greenstone3 can add jsessionids at end, which messes up image file extension detection
232 int jsessionID_index = url_lstring.indexOf(";jsessionid=");
233 if(jsessionID_index >= 0) {
234 url_lstring = url_lstring.substring(0, jsessionID_index);
235 }
236
237 if (image_type_ == null)
238 return true;
239
240 String tmp = image_type_;
241 String original_image_type_ = image_type_;
242
243 while (image_type_ != null && image_type_.indexOf("%") >= 0) {
244
245 tmp = image_type_.substring(0, image_type_.indexOf("%"));
246
247 if (image_type_.length() > image_type_.indexOf("%") + 1)
248 image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
249 else
250 image_type_ = null;
251
252 if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
253 image_type_ = original_image_type_;
254 return true;
255 }
256 }
257
258 if (image_type_ != null && url_lstring.endsWith(image_type_)) {
259 image_type_ = original_image_type_;
260 return true;
261 }
262
263 image_type_ = original_image_type_;
264 return false;
265 }
266
267 /** Restricts images to only those that satisfy several specified conditions
268 * regarding the content of the image name and url.
269 *
270 * @param url_string the url to check
271 * @return true if the image is satisfactory, false if not */
272 public boolean filter_image(String url_string)
273 {
274
275 if (image_ignore_==null || !url_string.startsWith(image_ignore_))
276 {
277 if (!already_visited(url_string))
278 {
279 if (image_mustnothave_ != null) {
280
281 String tmp = image_mustnothave_;
282 String original_image_mustnothave_ = image_mustnothave_;
283
284 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
285
286 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
287 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
288 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
289 image_mustnothave_.length());
290 else
291 image_mustnothave_ = null;
292
293
294
295 if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
296
297 image_mustnothave_ = original_image_mustnothave_;
298 return false;
299 }
300 }
301
302 image_mustnothave_ = original_image_mustnothave_;
303
304 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
305 image_mustnothave_ = original_image_mustnothave_;
306 return false;
307 }
308
309 if (verbosity_ > 2) {
310 System.err.println("src url = "+ url_string);
311 }
312
313 image_mustnothave_ = original_image_mustnothave_;
314
315 }
316
317 } else { // already visited this image link
318 System.err.println("\t####" + url_string + " already visited - filter_image returning false");
319 // Isn't it that if we've already visited the image link once before,
320 // we've dealt with it anyway once before (in one way or another: decided it
321 // didn't pass the filter, or added the image for download if it did pass the
322 // filters ) so we don't process this image again again?
323 return false;
324 }
325
326 }
327
328 return true;
329 }
330
331 /** Restricts links to only those that satisfy several specified conditions
332 * regarding the address of the link.
333 *
334 * @param url_string the url to check
335 * @param new_url_string the url from which this link was found
336 * @param depth the number of links followed on this path
337 * @return true if the image is satisfactory, false if not */
338 public boolean filter_href(String url_string, String new_url_string, int depth)
339 {
340 boolean has_href = false;
341 String tmp = href_musthave_;
342 String original_href_musthave_ = href_musthave_;
343
344 // checks that it does contain this content
345 if (href_musthave_ != null) {
346
347 while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
348
349 tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
350 if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
351 href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
352 else
353 href_musthave_ = null;
354
355 if (url_string.indexOf(tmp) >= 0)
356 has_href = true;
357 }
358
359 if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
360 has_href = true;
361
362 href_musthave_ = original_href_musthave_;
363 }
364
365 tmp = image_mustnothave_;
366 String original_image_mustnothave_ = image_mustnothave_;
367
368 // checks that it doesn't contain this content
369 if (image_mustnothave_ != null) {
370
371 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
372
373 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
374 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
375 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
376 else
377 image_mustnothave_ = null;
378
379 if (url_string.indexOf(tmp) >= 0)
380 has_href = false;
381 }
382 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
383 has_href = false;
384
385 image_mustnothave_ = original_image_mustnothave_;
386 }
387
388 // return true if the link is valid and false if not
389 if (href_musthave_==null || has_href)
390 {
391 // might be another URL
392 if (depth < app_.maxDepth())
393 {
394 if (!new_url_string.startsWith(url_string))
395 {
396 return true;
397 }
398 }
399 }
400 return false;
401 }
402
403 /** Adds an image to the stored downloaded images as a triplet.
404 * Ensures that the number of images downloaded but not displayed at
405 * any one time is controlled by using a buffer. If the buffer is
406 * full this function will wait until space becomes available before
407 * continuing. It also restricts the
408 * total number of images to download as specified by the applet.
409 *
410 * @param url the image to download
411 * @param from_url the url that this image was sourced from
412 * @param img_name the name of the image */
413 public void add_image(URL url, String from_url, String img_name)
414 {
415 // get the image from the url
416 if (verbosity_>=2) {
417 System.err.println(" Downloading image URL: " + url.toString());
418 }
419
420 if (image_visited(url.toString(),img_name)) return;
421
422 int size = download_images_.downloadImage(tracker,url, from_url, img_name);
423
424 try{
425 // if have completed the maximum number of downloads for the
426 // application then stop downloading
427 if (size == app_.maxDownloads()) {
428 // NOTE: the app can continue displaying images forever after download is
429 // finished, until interrupted/stopped.
430 // So don't set stop_running=false just because downloads have finished.
431 //stop_running = true; // Don't do this!
432 //thread_running = false;
433 //thread.currentThread().interrupt();
434
435 stop_downloading = true;
436 //stop(); // TODO, remove this, replacing with above
437
438 }
439
440 }
441 catch (Exception e) {
442 thread_running_ = false;
443 //stop(); // TODO
444 stop_downloading = true;
445 e.printStackTrace();
446 }
447 }
448
449 /** Connects to the starting url and looks for all images and links from this
450 * original page. Image links are processed first, so that any images found can be
451 * downloaded immediately and placed on the applet. Secondly, the links to other
452 * pages are recursively processed by this function and treated as a starting url
453 *
454 * @param new_url the url from which to start searching for images and links
455 * @param depth the number of links that have been followed on this path */
456 public void rec_add_images(String new_url, int depth)
457 {
458 // Check if the application's stopping, to end this recursive function as soon as possible
459 if(stop_running) {
460 return;
461 }
462
463 if (verbosity_ >= 2) {
464 System.err.println("*** Inspecting url: " + new_url);
465 }
466
467 if (already_visited(new_url)) return;
468
469 // check if there is a scenario where external hyperlinks are being used
470 externalLinks();
471 String img_name = new String();
472
473 // connect to the url
474 // stopRunning would have set the interrupted flag, and
475 // CURL checks for that in its loop, *outside* its potentially-blocking read() call
476 CURL curl = (app_.gsdlversion == 3) ? new CURL(new_url, app_.baseURL) : new CURL(new_url);
477
478 if (curl.connected_ok())
479 {
480 if (verbosity_ >= 1) {
481 System.err.print("Connected OK ... ");
482 }
483
484 // read the page
485 curl.readAll();
486 if (verbosity_ >= 1) {
487 System.err.println("URL read.");
488 }
489
490 // get all the <code><img src=</code> links into a vector
491 Vector src_links = curl.getSrcLinks();
492
493 if (verbosity_ >= 2) {
494 System.err.println(" Got src links... there are " + src_links.size() + " of them.");
495 }
496 // process each of the image links according to the parameters given.
497 for (int i = 0; i < src_links.size() && !stop_running && !stop_downloading; i++)
498 {
499 URL url = (URL)src_links.get(i);
500 String url_string = url.toString();
501
502 //System.err.println(" source links " + i + " [" + url_string +"]");
503
504 if (verbosity_ >= 4) {
505 System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
506 }
507
508 if (image_file_extension(url_string))
509 {
510 if (filter_image(url_string))
511 {
512 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
513
514 if (verbosity_ >= 2) {
515 System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
516 }
517
518 if ((external_links_ != null) && (!external_links_.isEmpty())) {
519 String ext = (String) external_links_.get(img_name);
520
521
522 if (ext != null){
523 add_image(url, ext, img_name);
524
525 }
526 else{
527
528 add_image(url, new_url, img_name);
529 }
530 }
531 else {
532
533 add_image(url, new_url, img_name);
534 }
535
536
537 }
538
539 }
540
541 }
542
543 if(stop_running && verbosity_ >= 3) {
544 System.err.println("*** DownloadUrls.rec_add_images() - Asked to stop running");
545 return;
546 }
547
548 // get all the <code><a href=</code> links into a vector
549 Vector href_links = curl.getHrefLinks();
550
551 if (verbosity_ >= 2) {
552 System.err.println(" Got href links... there are " + href_links.size() + " of them.");
553 }
554
555
556 // process each of the href links according to the parameters given.
557 for (int i = 0; i < href_links.size() && !stop_running && !stop_downloading; i++)
558 {
559
560 URL url = (URL)href_links.get(i);
561 String url_string = url.toString();
562 //System.err.println(" href links " + i + "[" + url_string +"]");
563
564 if (image_file_extension(url_string))
565 {
566
567 if (filter_image(url_string))
568
569 {
570
571 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
572 if (verbosity_ >= 2) {
573 System.err.println(" Filtered: href_link[" + i + "] = " + url_string);
574 }
575 if ((external_links_ != null) && (!external_links_.isEmpty())) {
576 String ext = (String) external_links_.get(img_name);
577
578 if (ext != null)
579 add_image(url, ext, img_name);
580 else
581 add_image(url, new_url, img_name);
582 }
583 else {
584 add_image(url, url_string, img_name);
585 }
586 }
587 }
588 else
589 {
590 if (filter_href(url_string,new_url,depth))
591 {
592 // If application has stopped, then don't do the
593 // recursive call, so we stop faster before exploring yet
594 // more links and deciding to stop then
595 if(stop_running) {
596 return;
597 }
598 rec_add_images(url_string,depth+1);
599
600 }
601 }
602 }
603 }
604
605 else {
606 System.err.println("Unable to download "+new_url);
607 unable_to_download = true;
608 }
609
610 if(stop_running && verbosity_ >= 3) {
611 System.err.println("*** DownloadUrls.rec_add_images() thread has been told to stop.");
612 }
613 }
614
615 public boolean wasUnableToDownload() { return unable_to_download; }
616
617
618 /** Used in cases where the image maps to a url outside of it's original location.
619 * When used with Greenstone the collage images will refer to documents in the collections
620 * from which the images are sourced. When used individually, the images may be saved into
621 * a user directory and the pages they reference may be external hyperlinks.
622 * This function reads that external links file and creates a hash map of the image to
623 * its external hyperlink. If the file does not exist the download thread will continue
624 * and assume the first case, that links are internal. */
625 public void externalLinks() {
626 external_links_ = null;
627 try {
628
629 if (starting_url_ == null || (document_root_ != null && starting_url_.indexOf(document_root_) >= 0) ){
630 if (verbosity_ >= 3) {
631 System.err.println("**** " + starting_url_ + " is not an external link.");
632 }
633 return;
634 }
635
636 // open a url to the file written
637 URL u = new URL(starting_url_ + "externallinks");
638
639 BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
640
641 external_links_ = new Hashtable();
642
643 String l = r.readLine();
644 // split the line of the space, first part is the image, second part the link
645 while (l != null && !stop_running && !stop_downloading) {
646
647 String tmp1 = new String();
648 String tmp2 = new String();
649
650 if (l.indexOf(" ") >= 0) {
651
652 tmp1 = l.substring(0, l.indexOf(" "));
653 if (l.length() > l.indexOf(" ") + 1)
654 tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
655 else
656 tmp2 = null;
657
658 if (tmp2 != null) {
659
660 external_links_.put(tmp1, tmp2);
661 //System.err.println(tmp1 + " " + tmp2);
662 }
663 }
664 l = r.readLine();
665 }
666
667 r.close();
668
669 if(stop_running && verbosity_ >= 3) {
670 System.err.println("*** DownloadUrls.externalLinks(): Asked to stop running");
671 }
672
673 } catch (Exception e) {
674 e.printStackTrace();
675 return;
676 }
677 }
678
679 /** Controls the download thread */
680 public void run ()
681 {
682 System.err.println("Starting download thread.");
683 visited_url_ = new Hashtable();
684 visited_images_ = new Hashtable();
685
686 rec_add_images(starting_url_,1);
687 download_images_.stopDownload();
688 System.err.println("DownloadUrls.run() - download thread finished.");
689 }
690}
Note: See TracBrowser for help on using the repository browser.