source: tags/gsdl-2_70u-distribution/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 11745

Last change on this file since 11745 was 11715, checked in by kjdon, 18 years ago

committed Shaoquns version of the applet for the branch - version 1.7

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1package org.nzdl.gsdl.GsdlCollageApplet;
2
3import java.awt.*;
4import java.io.*;
5import java.net.*;
6import java.util.*;
7
8import javax.swing.ImageIcon; //****
9
10
11/**
12 * @author Katrina Edgar
13 * @author David Bainbridge
14 *
15 * Controls retrieval of images from the specified starting url. Follows appropriate
16 * links from this starting point, traversing in a tree-like state through several other
17 * pages. Filters images and links based on specified parameters. Also controls the quantity
18 * of downloading that occurs by restricting the number of downloaded images that are yet to
19 * be displayed to 10, and the total number of downloads allowed is also restricted by
20 * the applet application (to prevent downloading occuring infinitely). */
21
22public class DownloadUrls extends Thread {
23
24 /** Refers to applet */
25 GsdlCollageApplet app_ = null;
26 /** Refers to download thread */
27 DownloadImages download_images_ = null;
28
29 /** The address from which the application should start looking for images */
30 String starting_url_ = null;
31
32 /** the root directory of Greenstone*/
33 String document_root_ = null;
34
35
36 /** CHRIS - Holds the contents of the collection's assoc directory */
37 // File[] assocDir_ = null;
38
39 /** Restricts links followed from the starting url to links that contain this string */
40 String href_musthave_ = null;
41 /** Restricts links followed from the starting url to links that do not contain this string.
42 * Also prevents image names from containing this string */
43 String image_mustnothave_ = null;
44 /** Ignore images whose names begin with this string */
45 String image_ignore_ = null;
46 /** Restricts the types of images included in the collage, for example jpg, gif, etc. */
47 String image_type_ = null;
48
49 /** A static delay used when attempting to download more images into a full downloading buffer */
50 final int delay_ = 3000;
51 /** The maximum number of images to have downloaded and not yet displayed */
52 final int buffer_size_ = 1;
53
54 /** Used in cases where the image maps to a url outside of it's original location.
55 * When used with Greenstone the collage images will refer to documents in the collections
56 * from which the images are sourced. When used individually, the images may be saved into
57 * a user directory and the pages they reference may be external hyperlinks. */
58 Hashtable external_links_ = null;
59
60 /** Records all urls which have already been examined */
61 Hashtable visited_url_ = null;
62 /** Determines whether there are still pages to examine and images to download */
63 boolean thread_running_ = true;
64
65 int verbosity_ = 0;
66
67 /** Records all images which have already been examined */
68 Hashtable visited_images_ = null;
69
70 MediaTracker tracker;
71
72 /** Constructor to initialise a download thread from which images are found,
73 * saves parameters into local variables for use within the class.
74 *
75 * @param app reference to the applet
76 * @param download_images class which stores the images retrieved in triplets
77 * @param starting_url the url from which the search for images should begin
78 * @param href_musthave restricts links to only those containing this string
79 * @param image_mustnothave restricts links and image names to only those that don't contain this string
80 * @param image_ignore restricts the beginning of image names
81 * @param image_type restricts the type of images included in the collage to those named */
82 public DownloadUrls(GsdlCollageApplet app,
83 DownloadImages download_images, String starting_url,
84 String href_musthave, String image_mustnothave,
85 String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
86 {
87 super("DownloadUrls");
88 app_ = app;
89 download_images_ = download_images;
90
91 starting_url_ = starting_url;
92 href_musthave_ = href_musthave;
93 image_mustnothave_ = image_mustnothave;
94 image_ignore_ = image_ignore;
95 image_type_ = image_type;
96 document_root_ = document_root;
97 verbosity_ = verbosity;
98 tracker = trk;
99
100 System.err.println("starting_url_ " + starting_url +"\n"+
101 "href_musthave_ " + href_musthave +"\n"+
102 "image_mustnothave_ " + image_mustnothave+"\n"+
103 "image_ignore_ "+ image_ignore+"\n"+
104 "image_type_ "+ image_type+"\n"+
105 "document root "+ document_root_
106 );
107
108
109
110
111 }
112
113
114 /** Determines whether or not a url has already been examined
115 *
116 * @param url_string the url to check
117 * @return true if the url has been visited, false if not */
118 public boolean already_visited(String url_string)
119 {
120 int hash_pos = url_string.indexOf("#");
121 if (hash_pos>0)
122 {
123 // strip off #anchor reference
124 url_string = url_string.substring(0,hash_pos);
125 }
126
127 // if the url has been visited before, return true
128 if (visited_url_.containsKey(url_string))
129 {
130 if (verbosity_ > 3)
131 {
132 System.err.println("Visited " + url_string + " before!");
133 }
134 return true;
135 }
136
137 visited_url_.put(url_string,"visited");
138
139 return false;
140 }
141
142 /** Determines whether or not an images or its screenview has been visited)
143 * has already been examined
144 *
145 * @param url_string the url to check
146 * @param img_name the image to check
147 * @return true if the url has been visited, false if not */
148 public boolean image_visited(String url_string, String img_name)
149 {
150 String hash_dir = url_string.substring(0,url_string.lastIndexOf("/"));
151
152 if ( visited_images_.containsKey(hash_dir)){
153 Hashtable hashed_images = (Hashtable)visited_images_.get(hash_dir);
154
155 if (img_name.startsWith("screenview")){
156 return true;
157 }
158
159 if (hashed_images.containsKey(img_name)){
160 return true;
161 }
162
163 Enumeration enu = hashed_images.keys();
164 for(;enu.hasMoreElements();){
165 String name = (String)enu.nextElement();
166 if(name.startsWith("screenview")){
167 return true;
168 }
169 }
170
171 hashed_images.put(img_name,"visited");
172 }
173 else{
174 Hashtable hashed_images = new Hashtable();
175 hashed_images.put(img_name,"visited");
176 visited_images_.put(hash_dir,hashed_images);
177 }
178
179 return false;
180 }
181
182
183
184
185
186 /** Restricts the type of images that can be included in the collage
187 *
188 * @param url_string the url to check
189 * @return true if the image is of a specified type, false if not */
190 public boolean image_file_extension(String url_string)
191 {
192 // lower case comparisons
193 String url_lstring = url_string.toLowerCase();
194
195 if (image_type_ == null)
196 return true;
197
198 String tmp = image_type_;
199 String original_image_type_ = image_type_;
200
201 while (image_type_ != null && image_type_.indexOf("%") >= 0) {
202
203 tmp = image_type_.substring(0, image_type_.indexOf("%"));
204
205 if (image_type_.length() > image_type_.indexOf("%") + 1)
206 image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
207 else
208 image_type_ = null;
209
210 if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
211 image_type_ = original_image_type_;
212 return true;
213 }
214 }
215
216 if (image_type_ != null && url_lstring.endsWith(image_type_)) {
217 image_type_ = original_image_type_;
218 return true;
219 }
220
221 image_type_ = original_image_type_;
222 return false;
223 }
224
225 /** Restricts images to only those that satisfy several specified conditions
226 * regarding the content of the image name and url.
227 *
228 * @param url_string the url to check
229 * @return true if the image is satisfactory, false if not */
230 public boolean filter_image(String url_string)
231 {
232
233 if (image_ignore_==null || !url_string.startsWith(image_ignore_))
234 {
235 if (!already_visited(url_string))
236 {
237 if (image_mustnothave_ != null) {
238
239 String tmp = image_mustnothave_;
240 String original_image_mustnothave_ = image_mustnothave_;
241
242 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
243
244 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
245 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
246 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
247 image_mustnothave_.length());
248 else
249 image_mustnothave_ = null;
250
251
252
253 if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
254
255 image_mustnothave_ = original_image_mustnothave_;
256 return false;
257 }
258 }
259
260 image_mustnothave_ = original_image_mustnothave_;
261
262 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
263 image_mustnothave_ = original_image_mustnothave_;
264 return false;
265 }
266
267 if (verbosity_ > 2) {
268 System.err.println("src url = "+ url_string);
269 }
270
271 image_mustnothave_ = original_image_mustnothave_;
272
273 }
274
275 }
276
277 }
278
279 return true;
280 }
281
282 /** Restricts links to only those that satisfy several specified conditions
283 * regarding the address of the link.
284 *
285 * @param url_string the url to check
286 * @param new_url_string the url from which this link was found
287 * @param depth the number of links followed on this path
288 * @return true if the image is satisfactory, false if not */
289 public boolean filter_href(String url_string, String new_url_string, int depth)
290 {
291 boolean has_href = false;
292 String tmp = href_musthave_;
293 String original_href_musthave_ = href_musthave_;
294
295 // checks that it does contain this content
296 if (href_musthave_ != null) {
297
298 while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
299
300 tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
301 if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
302 href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
303 else
304 href_musthave_ = null;
305
306 if (url_string.indexOf(tmp) >= 0)
307 has_href = true;
308 }
309
310 if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
311 has_href = true;
312
313 href_musthave_ = original_href_musthave_;
314 }
315
316 tmp = image_mustnothave_;
317 String original_image_mustnothave_ = image_mustnothave_;
318
319 // checks that it doesn't contain this content
320 if (image_mustnothave_ != null) {
321
322 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
323
324 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
325 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
326 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
327 else
328 image_mustnothave_ = null;
329
330 if (url_string.indexOf(tmp) >= 0)
331 has_href = false;
332 }
333 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
334 has_href = false;
335
336 image_mustnothave_ = original_image_mustnothave_;
337 }
338
339 // return true if the link is valid and false if not
340 if (href_musthave_==null || has_href)
341 {
342 // might be another URL
343 if (depth < app_.maxDepth())
344 {
345 if (!new_url_string.startsWith(url_string))
346 {
347 return true;
348 }
349 }
350 }
351 return false;
352 }
353
354 /** Adds an image to the stored downloaded images as a triplet.
355 * Ensures that the number of images downloaded but not displayed at
356 * any one time is controlled by using a buffer. If the buffer is
357 * full this function will wait until space becomes available before
358 * continuing. It also restricts the
359 * total number of images to download as specified by the applet.
360 *
361 * @param url the image to download
362 * @param from_url the url that this image was sourced from
363 * @param img_name the name of the image */
364 public void add_image(URL url, String from_url, String img_name)
365 {
366 // get the image from the url
367 if (verbosity_>=2) {
368 System.err.println(" Downloading image URL: " + url.toString());
369 }
370
371 if (image_visited(url.toString(),img_name)) return;
372
373 int size = download_images_.downloadImage(tracker,url, from_url, img_name);
374
375 try{
376 // if have completed the maximum number of downloads for the
377 // application then stop
378 if (size == app_.maxDownloads()) {
379 stop();
380 }
381
382 }
383 catch (Exception e) {
384 thread_running_ = false;
385 stop();
386 e.printStackTrace();
387 }
388 }
389
390 /** Connects to the starting url and looks for all images and links from this
391 * original page. Image links are processed first, so that any images found can be
392 * downloaded immediately and placed on the applet. Secondly, the links to other
393 * pages are recursively processed by this function and treated as a starting url
394 *
395 * @param new_url the url from which to start searching for images and links
396 * @param depth the number of links that have been followed on this path */
397 public void rec_add_images(String new_url, int depth)
398 {
399
400 if (already_visited(new_url)) return;
401
402 // check if there is a scenario where external hyperlinks are being used
403 externalLinks();
404 String img_name = new String();
405
406 // connect to the url
407 CURL curl = new CURL(new_url);
408 if (curl.connected_ok())
409 {
410 if (verbosity_ >= 1) {
411 System.err.print("Connected OK ... ");
412 }
413
414 // read the page
415 curl.readAll();
416 if (verbosity_ >= 1) {
417 System.err.println("URL read.");
418 }
419
420 // get all the <code><img src=</code> links into a vector
421 Vector src_links = curl.getSrcLinks();
422
423
424 if (verbosity_ >= 2) {
425 System.err.println(" Got src links... there are " + src_links.size() + " of them.");
426 }
427 // process each of the image links according to the parameters given.
428 for (int i = 0; i < src_links.size(); i++)
429 {
430 URL url = (URL)src_links.get(i);
431 String url_string = url.toString();
432
433 //System.err.println(" source links " + i + " [" + url_string +"]");
434
435 if (verbosity_ >= 4) {
436 System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
437 }
438
439 if (image_file_extension(url_string))
440 {
441 if (filter_image(url_string))
442 {
443 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
444
445 if (verbosity_ >= 2) {
446 System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
447 }
448
449 if ((external_links_ != null) && (!external_links_.isEmpty())) {
450 String ext = (String) external_links_.get(img_name);
451
452
453 if (ext != null){
454 add_image(url, ext, img_name);
455
456 }
457 else{
458
459 add_image(url, new_url, img_name);
460 }
461 }
462 else {
463
464 add_image(url, new_url, img_name);
465 }
466
467
468 }
469
470 }
471
472 }
473
474 // get all the <code><a href=</code> links into a vector
475 Vector href_links = curl.getHrefLinks();
476
477
478 if (verbosity_ >= 2) {
479 System.err.println(" Got href links... there are " + href_links.size() + " of them.");
480 }
481
482
483 // process each of the href links according to the parameters given.
484 for (int i = 0; i < href_links.size(); i++)
485 {
486
487 URL url = (URL)href_links.get(i);
488 String url_string = url.toString();
489 //System.err.println(" href links " + i + "[" + url_string +"]");
490
491 if (image_file_extension(url_string))
492 {
493
494 if (filter_image(url_string))
495
496 {
497
498 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
499
500 if ((external_links_ != null) && (!external_links_.isEmpty())) {
501 String ext = (String) external_links_.get(img_name);
502
503 if (ext != null)
504 add_image(url, ext, img_name);
505 else
506 add_image(url, new_url, img_name);
507 }
508 else {
509 add_image(url, url_string, img_name);
510 }
511 }
512 }
513 else
514 {
515 if (filter_href(url_string,new_url,depth))
516 {
517
518 rec_add_images(url_string,depth+1);
519
520 }
521 }
522 }
523 }
524
525 else {
526 System.err.println("Unable able to download "+new_url);
527 }
528 }
529
530
531 /** Used in cases where the image maps to a url outside of it's original location.
532 * When used with Greenstone the collage images will refer to documents in the collections
533 * from which the images are sourced. When used individually, the images may be saved into
534 * a user directory and the pages they reference may be external hyperlinks.
535 * This function reads that external links file and creates a hash map of the image to
536 * its external hyperlink. If the file does not exist the download thread will continue
537 * and assume the first case, that links are internal. */
538 public void externalLinks() {
539 external_links_ = null;
540 try {
541
542 if (starting_url_ == null || (document_root_ != null && starting_url_.indexOf(document_root_) >= 0) ){
543 return;
544 }
545
546 // open a url to the file written
547 URL u = new URL(starting_url_ + "externallinks");
548
549 BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
550
551 external_links_ = new Hashtable();
552
553 String l = r.readLine();
554 // split the line of the space, first part is the image, second part the link
555 while (l != null) {
556
557 String tmp1 = new String();
558 String tmp2 = new String();
559
560 if (l.indexOf(" ") >= 0) {
561
562 tmp1 = l.substring(0, l.indexOf(" "));
563 if (l.length() > l.indexOf(" ") + 1)
564 tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
565 else
566 tmp2 = null;
567
568 if (tmp2 != null) {
569
570 external_links_.put(tmp1, tmp2);
571 //System.err.println(tmp1 + " " + tmp2);
572 }
573 }
574 l = r.readLine();
575 }
576
577 r.close();
578
579 } catch (Exception e) {
580 e.printStackTrace();
581 return;
582 }
583 }
584
585 /** Controls the download thread */
586 public void run ()
587 {
588 System.err.println("Starting download thread.");
589 visited_url_ = new Hashtable();
590 visited_images_ = new Hashtable();
591
592 rec_add_images(starting_url_,1);
593 download_images_.stopDownload();
594 System.err.println("Download thread finished.");
595 }
596}
Note: See TracBrowser for help on using the repository browser.