source: main/trunk/greenstone3/src/java/org/greenstone/applet/GsdlCollageApplet/DownloadUrls.java@ 38871

Last change on this file since 38871 was 38871, checked in by anupama, 3 months ago

I've got the CURL and DownloadUrls methods now finding the right image URLs and suffixing them to the correct baseURL. To get this far I'm still hardcoding some applet params, including new ones, into the Java code. However, now that the image URLs are being worked out properly, I'm hitting a security exception when the applet tries to download the first correct image. Some googling seemed to indicate that the applet needs to be signed or must not be running in a sandbox. I'm wondering whether the appletviewer, not being the server, doesn't allow the applet to access images on a distinct server URL, and whether, if I could get the applet running as webswing on the server itself, it would have the right permissions to access/download the images. That will be the next step. If that doesn't work, I will need to first try to rewrite this JApplet as an application and see if that change makes a difference.

File size: 20.0 KB
Line 
1package org.greenstone.applet.GsdlCollageApplet;
2
3import java.awt.*;
4import java.io.*;
5import java.net.*;
6import java.util.*;
7
8import javax.swing.ImageIcon; //****
9
10
11/**
12 * @author Katrina Edgar
13 * @author David Bainbridge
14 *
15 * Controls retrieval of images from the specified starting url. Follows appropriate
16 * links from this starting point, traversing in a tree-like manner through several other
17 * pages. Filters images and links based on specified parameters. Also controls the quantity
18 * of downloading that occurs by restricting the number of downloaded images that are yet to
19 * be displayed to 10, and the total number of downloads allowed is also restricted by
20 * the applet application (to prevent downloading occurring infinitely). */
21
22public class DownloadUrls extends Thread {
23 // for GS3
24 String gs3CollImgPath = null;
25 String baseURL = null;
26
27 /** Refers to applet */
28 GsdlCollageApplet app_ = null;
29 /** Refers to download thread */
30 DownloadImages download_images_ = null;
31
32 /** The address from which the application should start looking for images */
33 String starting_url_ = null;
34
35 /** the root directory of Greenstone*/
36 String document_root_ = null;
37
38
39 /** CHRIS - Holds the contents of the collection's assoc directory */
40 // File[] assocDir_ = null;
41
42 /** Restricts links followed from the starting url to links that contain this string */
43 String href_musthave_ = null;
44 /** Restricts links followed from the starting url to links that do not contain this string.
45 * Also prevents image names from containing this string */
46 String image_mustnothave_ = null;
47 /** Ignore images whose names begin with this string */
48 String image_ignore_ = null;
49 /** Restricts the types of images included in the collage, for example jpg, gif, etc. */
50 String image_type_ = null;
51
52 /** A static delay used when attempting to download more images into a full downloading buffer */
53 final int delay_ = 3000;
54 /** The maximum number of images to have downloaded and not yet displayed */
55 final int buffer_size_ = 1;
56
57 /** Used in cases where the image maps to a url outside of it's original location.
58 * When used with Greenstone the collage images will refer to documents in the collections
59 * from which the images are sourced. When used individually, the images may be saved into
60 * a user directory and the pages they reference may be external hyperlinks. */
61 Hashtable external_links_ = null;
62
63 /** Records all urls which have already been examined */
64 Hashtable visited_url_ = null;
65 /** Determines whether there are still pages to examine and images to download */
66 boolean thread_running_ = true;
67
68 int verbosity_ = 0;
69
70 /** Records all images which have already been examined */
71 Hashtable visited_images_ = null;
72
73 MediaTracker tracker;
74
75 /** Constructor to initialise a download thread from which images are found,
76 * saves parameters into local variables for use within the class.
77 *
78 * @param app reference to the applet
79 * @param download_images class which stores the images retrieved in triplets
80 * @param starting_url the url from which the search for images should begin
81 * @param href_musthave restricts links to only those containing this string
82 * @param image_mustnothave restricts links and image names to only those that don't contain this string
83 * @param image_ignore restricts the beginning of image names
84 * @param image_type restricts the type of images included in the collage to those named */
85 public DownloadUrls(GsdlCollageApplet app,
86 DownloadImages download_images, String starting_url,
87 String href_musthave, String image_mustnothave,
88 String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
89 {
90 super("DownloadUrls");
91 app_ = app;
92 download_images_ = download_images;
93
94 starting_url_ = starting_url;
95 href_musthave_ = href_musthave;
96 image_mustnothave_ = image_mustnothave;
97 image_ignore_ = image_ignore;
98 image_type_ = image_type;
99 document_root_ = document_root;
100 verbosity_ = verbosity;
101 tracker = trk;
102
103 System.err.println("starting_url_ " + starting_url +"\n"+
104 "href_musthave_ " + href_musthave +"\n"+
105 "image_mustnothave_ " + image_mustnothave+"\n"+
106 "image_ignore_ "+ image_ignore+"\n"+
107 "image_type_ "+ image_type+"\n"+
108 "document root "+ document_root_
109 );
110
111
112
113
114 }
115
116 public void setupForGS3(String gs3CollImgPath, String baseURL)
117 {
118 this.gs3CollImgPath = gs3CollImgPath;
119 this.baseURL = baseURL;
120 }
121
122 /** Determines whether or not a url has already been examined
123 *
124 * @param url_string the url to check
125 * @return true if the url has been visited, false if not */
126 public boolean already_visited(String url_string)
127 {
128 int hash_pos = url_string.indexOf("#");
129 if (hash_pos>0)
130 {
131 // strip off #anchor reference
132 url_string = url_string.substring(0,hash_pos);
133 }
134
135 // if the url has been visited before, return true
136 if (visited_url_.containsKey(url_string))
137 {
138 if (verbosity_ > 3)
139 {
140 System.err.println("Visited " + url_string + " before!");
141 }
142 return true;
143 }
144
145 visited_url_.put(url_string,"visited");
146
147 return false;
148 }
149
150 /** Determines whether or not an images or its screenview has been visited)
151 * has already been examined
152 *
153 * @param url_string the url to check
154 * @param img_name the image to check
155 * @return true if the url has been visited, false if not */
156 public boolean image_visited(String url_string, String img_name)
157 {
158 String hash_dir = url_string.substring(0,url_string.lastIndexOf("/"));
159
160 if ( visited_images_.containsKey(hash_dir)){
161 Hashtable hashed_images = (Hashtable)visited_images_.get(hash_dir);
162
163 if (img_name.startsWith("screenview")){
164 return true;
165 }
166
167 if (hashed_images.containsKey(img_name)){
168 return true;
169 }
170
171 Enumeration enu = hashed_images.keys();
172 for(;enu.hasMoreElements();){
173 String name = (String)enu.nextElement();
174 if(name.startsWith("screenview")){
175 return true;
176 }
177 }
178
179 hashed_images.put(img_name,"visited");
180 }
181 else{
182 Hashtable hashed_images = new Hashtable();
183 hashed_images.put(img_name,"visited");
184 visited_images_.put(hash_dir,hashed_images);
185 }
186
187 return false;
188 }
189
190
191
192
193
194 /** Restricts the type of images that can be included in the collage
195 *
196 * @param url_string the url to check
197 * @return true if the image is of a specified type, false if not */
198 public boolean image_file_extension(String url_string)
199 {
200 // lower case comparisons
201 String url_lstring = url_string.toLowerCase();
202
203
204 // greenstone3 can add jsessionids at end, which messes up image file extension detection
205 int jsessionID_index = url_lstring.indexOf(";jsessionid=");
206 if(jsessionID_index >= 0) {
207 url_lstring = url_lstring.substring(0, jsessionID_index);
208 }
209
210 if (image_type_ == null)
211 return true;
212
213 String tmp = image_type_;
214 String original_image_type_ = image_type_;
215
216 while (image_type_ != null && image_type_.indexOf("%") >= 0) {
217
218 tmp = image_type_.substring(0, image_type_.indexOf("%"));
219
220 if (image_type_.length() > image_type_.indexOf("%") + 1)
221 image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
222 else
223 image_type_ = null;
224
225 if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
226 image_type_ = original_image_type_;
227 return true;
228 }
229 }
230
231 if (image_type_ != null && url_lstring.endsWith(image_type_)) {
232 image_type_ = original_image_type_;
233 return true;
234 }
235
236 image_type_ = original_image_type_;
237 return false;
238 }
239
240 /** Restricts images to only those that satisfy several specified conditions
241 * regarding the content of the image name and url.
242 *
243 * @param url_string the url to check
244 * @return true if the image is satisfactory, false if not */
245 public boolean filter_image(String url_string)
246 {
247
248 if (image_ignore_==null || !url_string.startsWith(image_ignore_))
249 {
250 if (!already_visited(url_string))
251 {
252 if (image_mustnothave_ != null) {
253
254 String tmp = image_mustnothave_;
255 String original_image_mustnothave_ = image_mustnothave_;
256
257 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
258
259 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
260 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
261 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
262 image_mustnothave_.length());
263 else
264 image_mustnothave_ = null;
265
266
267
268 if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
269
270 image_mustnothave_ = original_image_mustnothave_;
271 return false;
272 }
273 }
274
275 image_mustnothave_ = original_image_mustnothave_;
276
277 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
278 image_mustnothave_ = original_image_mustnothave_;
279 return false;
280 }
281
282 if (verbosity_ > 2) {
283 System.err.println("src url = "+ url_string);
284 }
285
286 image_mustnothave_ = original_image_mustnothave_;
287
288 }
289
290 } else { // already visited this image link
291 System.err.println("\t####" + url_string + " already visited - filter_image returning false");
292 // Isn't it that if we've already visited the image link once before,
293 // we've dealt with it anyway once before (in one way or another: decided it
294 // didn't pass the filter, or added the image for download if it did pass the
295 // filters ) so we don't process this image again again?
296 return false;
297 }
298
299 }
300
301 return true;
302 }
303
304 /** Restricts links to only those that satisfy several specified conditions
305 * regarding the address of the link.
306 *
307 * @param url_string the url to check
308 * @param new_url_string the url from which this link was found
309 * @param depth the number of links followed on this path
310 * @return true if the image is satisfactory, false if not */
311 public boolean filter_href(String url_string, String new_url_string, int depth)
312 {
313 boolean has_href = false;
314 String tmp = href_musthave_;
315 String original_href_musthave_ = href_musthave_;
316
317 // checks that it does contain this content
318 if (href_musthave_ != null) {
319
320 while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
321
322 tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
323 if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
324 href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
325 else
326 href_musthave_ = null;
327
328 if (url_string.indexOf(tmp) >= 0)
329 has_href = true;
330 }
331
332 if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
333 has_href = true;
334
335 href_musthave_ = original_href_musthave_;
336 }
337
338 tmp = image_mustnothave_;
339 String original_image_mustnothave_ = image_mustnothave_;
340
341 // checks that it doesn't contain this content
342 if (image_mustnothave_ != null) {
343
344 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
345
346 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
347 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
348 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
349 else
350 image_mustnothave_ = null;
351
352 if (url_string.indexOf(tmp) >= 0)
353 has_href = false;
354 }
355 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
356 has_href = false;
357
358 image_mustnothave_ = original_image_mustnothave_;
359 }
360
361 // return true if the link is valid and false if not
362 if (href_musthave_==null || has_href)
363 {
364 // might be another URL
365 if (depth < app_.maxDepth())
366 {
367 if (!new_url_string.startsWith(url_string))
368 {
369 return true;
370 }
371 }
372 }
373 return false;
374 }
375
376 /** Adds an image to the stored downloaded images as a triplet.
377 * Ensures that the number of images downloaded but not displayed at
378 * any one time is controlled by using a buffer. If the buffer is
379 * full this function will wait until space becomes available before
380 * continuing. It also restricts the
381 * total number of images to download as specified by the applet.
382 *
383 * @param url the image to download
384 * @param from_url the url that this image was sourced from
385 * @param img_name the name of the image */
386 public void add_image(URL url, String from_url, String img_name)
387 {
388 // get the image from the url
389 if (verbosity_>=2) {
390 System.err.println(" Downloading image URL: " + url.toString());
391 }
392
393 if (image_visited(url.toString(),img_name)) return;
394
395 int size = download_images_.downloadImage(tracker,url, from_url, img_name);
396
397 try{
398 // if have completed the maximum number of downloads for the
399 // application then stop
400 if (size == app_.maxDownloads()) {
401 stop();
402 }
403
404 }
405 catch (Exception e) {
406 thread_running_ = false;
407 stop();
408 e.printStackTrace();
409 }
410 }
411
412 /** Connects to the starting url and looks for all images and links from this
413 * original page. Image links are processed first, so that any images found can be
414 * downloaded immediately and placed on the applet. Secondly, the links to other
415 * pages are recursively processed by this function and treated as a starting url
416 *
417 * @param new_url the url from which to start searching for images and links
418 * @param depth the number of links that have been followed on this path */
419 public void rec_add_images(String new_url, int depth)
420 {
421
422 if (verbosity_ >= 2) {
423 System.err.println("*** Inspecting url: " + new_url);
424 }
425
426 if (already_visited(new_url)) return;
427
428 // check if there is a scenario where external hyperlinks are being used
429 externalLinks();
430 String img_name = new String();
431
432 // connect to the url
433 CURL curl = (app_.gsdlversion == 3) ? new CURL(new_url, this.baseURL) : new CURL(new_url);
434
435 if (curl.connected_ok())
436 {
437 if (verbosity_ >= 1) {
438 System.err.print("Connected OK ... ");
439 }
440
441 // read the page
442 curl.readAll();
443 if (verbosity_ >= 1) {
444 System.err.println("URL read.");
445 }
446
447 // get all the <code><img src=</code> links into a vector
448 Vector src_links = curl.getSrcLinks();
449
450 if (verbosity_ >= 2) {
451 System.err.println(" Got src links... there are " + src_links.size() + " of them.");
452 }
453 // process each of the image links according to the parameters given.
454 for (int i = 0; i < src_links.size(); i++)
455 {
456 URL url = (URL)src_links.get(i);
457 String url_string = url.toString();
458
459 //System.err.println(" source links " + i + " [" + url_string +"]");
460
461 if (verbosity_ >= 4) {
462 System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
463 }
464
465 if (image_file_extension(url_string))
466 {
467 if (filter_image(url_string))
468 {
469 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
470
471 if (verbosity_ >= 2) {
472 System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
473 }
474
475 if ((external_links_ != null) && (!external_links_.isEmpty())) {
476 String ext = (String) external_links_.get(img_name);
477
478
479 if (ext != null){
480 add_image(url, ext, img_name);
481
482 }
483 else{
484
485 add_image(url, new_url, img_name);
486 }
487 }
488 else {
489
490 add_image(url, new_url, img_name);
491 }
492
493
494 }
495
496 }
497
498 }
499
500 // get all the <code><a href=</code> links into a vector
501 Vector href_links = curl.getHrefLinks();
502
503 if (verbosity_ >= 2) {
504 System.err.println(" Got href links... there are " + href_links.size() + " of them.");
505 }
506
507
508 // process each of the href links according to the parameters given.
509 for (int i = 0; i < href_links.size(); i++)
510 {
511
512 URL url = (URL)href_links.get(i);
513 String url_string = url.toString();
514 //System.err.println(" href links " + i + "[" + url_string +"]");
515
516 if (image_file_extension(url_string))
517 {
518
519 if (filter_image(url_string))
520
521 {
522
523 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
524 if (verbosity_ >= 2) {
525 System.err.println(" Filtered: href_link[" + i + "] = " + url_string);
526 }
527 if ((external_links_ != null) && (!external_links_.isEmpty())) {
528 String ext = (String) external_links_.get(img_name);
529
530 if (ext != null)
531 add_image(url, ext, img_name);
532 else
533 add_image(url, new_url, img_name);
534 }
535 else {
536 add_image(url, url_string, img_name);
537 }
538 }
539 }
540 else
541 {
542 if (filter_href(url_string,new_url,depth))
543 {
544
545 rec_add_images(url_string,depth+1);
546
547 }
548 }
549 }
550 }
551
552 else {
553 System.err.println("Unable able to download "+new_url);
554 }
555 }
556
557
558 /** Used in cases where the image maps to a url outside of it's original location.
559 * When used with Greenstone the collage images will refer to documents in the collections
560 * from which the images are sourced. When used individually, the images may be saved into
561 * a user directory and the pages they reference may be external hyperlinks.
562 * This function reads that external links file and creates a hash map of the image to
563 * its external hyperlink. If the file does not exist the download thread will continue
564 * and assume the first case, that links are internal. */
565 public void externalLinks() {
566 external_links_ = null;
567 try {
568
569 if (starting_url_ == null || (document_root_ != null && starting_url_.indexOf(document_root_) >= 0) ){
570 if (verbosity_ >= 3) {
571 System.err.println("**** " + starting_url_ + " is not an external link.");
572 }
573 return;
574 }
575
576 // open a url to the file written
577 URL u = new URL(starting_url_ + "externallinks");
578
579 BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
580
581 external_links_ = new Hashtable();
582
583 String l = r.readLine();
584 // split the line of the space, first part is the image, second part the link
585 while (l != null) {
586
587 String tmp1 = new String();
588 String tmp2 = new String();
589
590 if (l.indexOf(" ") >= 0) {
591
592 tmp1 = l.substring(0, l.indexOf(" "));
593 if (l.length() > l.indexOf(" ") + 1)
594 tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
595 else
596 tmp2 = null;
597
598 if (tmp2 != null) {
599
600 external_links_.put(tmp1, tmp2);
601 //System.err.println(tmp1 + " " + tmp2);
602 }
603 }
604 l = r.readLine();
605 }
606
607 r.close();
608
609 } catch (Exception e) {
610 e.printStackTrace();
611 return;
612 }
613 }
614
615 /** Controls the download thread */
616 public void run ()
617 {
618 System.err.println("Starting download thread.");
619 visited_url_ = new Hashtable();
620 visited_images_ = new Hashtable();
621
622 rec_add_images(starting_url_,1);
623 download_images_.stopDownload();
624 System.err.println("Download thread finished.");
625 }
626}
Note: See TracBrowser for help on using the repository browser.