source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 11472

Last change on this file since 11472 was 11472, checked in by shaoqun, 18 years ago

set external_links_ = null if exception occurs

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1package org.nzdl.gsdl.GsdlCollageApplet;
2
3import java.awt.*;
4import java.io.*;
5import java.net.*;
6import java.util.*;
7
8// import javax.swing.ImageIcon; //****
9
10
11/**
12 * @author Katrina Edgar
13 * @author David Bainbridge
14 *
15 * Controls retrieval of images from the specified starting url. Follows appropriate
16 * links from this starting point, traversing in a tree-like state through several other
17 * pages. Filters images and links based on specified parameters. Also controls the quantity
18 * of downloading that occurs by restricting the number of downloaded images that are yet to
19 * be displayed to 10, and the total number of downloads allowed is also restricted by
20 * the applet application (to prevent downloading occuring infinitely). */
21
22public class DownloadUrls extends Thread {
23
24 /** Refers to applet */
25 GsdlCollageApplet app_ = null;
26 /** Refers to download thread */
27 DownloadImages download_images_ = null;
28
29 /** The address from which the application should start looking for images */
30 String starting_url_ = null;
31
32 /** the root directory of Greenstone*/
33 String document_root_ = null;
34
35
36 /** CHRIS - Holds the contents of the collection's assoc directory */
37 // File[] assocDir_ = null;
38
39 /** Restricts links followed from the starting url to links that contain this string */
40 String href_musthave_ = null;
41 /** Restricts links followed from the starting url to links that do not contain this string.
42 * Also prevents image names from containing this string */
43 String image_mustnothave_ = null;
44 /** Ignore images whose names begin with this string */
45 String image_ignore_ = null;
46 /** Restricts the types of images included in the collage, for example jpg, gif, etc. */
47 String image_type_ = null;
48
49 /** A static delay used when attempting to download more images into a full downloading buffer */
50 final int delay_ = 1000;
51 /** The maximum number of images to have downloaded and not yet displayed */
52 final int buffer_size_ = 10;
53
54 /** Used in cases where the image maps to a url outside of it's original location.
55 * When used with Greenstone the collage images will refer to documents in the collections
56 * from which the images are sourced. When used individually, the images may be saved into
57 * a user directory and the pages they reference may be external hyperlinks. */
58 Hashtable external_links_ = null;
59
60 /** Records all urls which have already been examined */
61 Hashtable visited_url_ = null;
62 /** Determines whether there are still pages to examine and images to download */
63 boolean thread_running_ = true;
64
65 int verbosity_ = 0;
66
67 /** Constructor to initialise a download thread from which images are found,
68 * saves parameters into local variables for use within the class.
69 *
70 * @param app reference to the applet
71 * @param download_images class which stores the images retrieved in triplets
72 * @param starting_url the url from which the search for images should begin
73 * @param href_musthave restricts links to only those containing this string
74 * @param image_mustnothave restricts links and image names to only those that don't contain this string
75 * @param image_ignore restricts the beginning of image names
76 * @param image_type restricts the type of images included in the collage to those named */
77 public DownloadUrls(GsdlCollageApplet app,
78 DownloadImages download_images, String starting_url,
79 String href_musthave, String image_mustnothave,
80 String image_ignore, String image_type, String document_root,int verbosity)
81 {
82 super("DownloadUrls");
83 app_ = app;
84 download_images_ = download_images;
85
86 starting_url_ = starting_url;
87 href_musthave_ = href_musthave;
88 image_mustnothave_ = image_mustnothave;
89 image_ignore_ = image_ignore;
90 image_type_ = image_type;
91 document_root_ = document_root;
92 verbosity_ = verbosity;
93
94
95 System.err.println("starting_url_ " + starting_url +"\n"+
96 "href_musthave_ " + href_musthave +"\n"+
97 "image_mustnothave_" + image_mustnothave+"\n"+
98 "image_ignore_ "+ image_ignore+"\n"+
99 "image_type_ "+ image_type+"\n"+
100 "document root "+ document_root_
101 );
102
103
104
105
106 }
107
108
109
110 /** Determines whether or not a url has already been examined
111 *
112 * @param url_string the url to check
113 * @return true if the url has been visited, false if not */
114 public boolean already_visited(String url_string)
115 {
116 int hash_pos = url_string.indexOf("#");
117 if (hash_pos>0)
118 {
119 // strip off #anchor reference
120 url_string = url_string.substring(0,hash_pos);
121 }
122
123 // if the url has been visited before, return true
124 if (visited_url_.containsKey(url_string))
125 {
126 if (verbosity_ > 3)
127 {
128 System.err.println("Visited " + url_string + " before!");
129 }
130 return true;
131 }
132
133 visited_url_.put(url_string,"visited");
134
135 return false;
136 }
137
138 /** Restricts the type of images that can be included in the collage
139 *
140 * @param url_string the url to check
141 * @return true if the image is of a specified type, false if not */
142 public boolean image_file_extension(String url_string)
143 {
144 // lower case comparisons
145 String url_lstring = url_string.toLowerCase();
146
147 if (image_type_ == null)
148 return true;
149
150 String tmp = image_type_;
151 String original_image_type_ = image_type_;
152
153 while (image_type_ != null && image_type_.indexOf("%") >= 0) {
154
155 tmp = image_type_.substring(0, image_type_.indexOf("%"));
156
157 if (image_type_.length() > image_type_.indexOf("%") + 1)
158 image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
159 else
160 image_type_ = null;
161
162 if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
163 image_type_ = original_image_type_;
164 return true;
165 }
166 }
167
168 if (image_type_ != null && url_lstring.endsWith(image_type_)) {
169 image_type_ = original_image_type_;
170 return true;
171 }
172
173 image_type_ = original_image_type_;
174 return false;
175 }
176
177 /** Restricts images to only those that satisfy several specified conditions
178 * regarding the content of the image name and url.
179 *
180 * @param url_string the url to check
181 * @return true if the image is satisfactory, false if not */
182 public boolean filter_image(String url_string)
183 {
184
185 if (image_ignore_==null || !url_string.startsWith(image_ignore_))
186 {
187 if (!already_visited(url_string))
188 {
189 if (image_mustnothave_ != null) {
190
191 String tmp = image_mustnothave_;
192 String original_image_mustnothave_ = image_mustnothave_;
193
194 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
195
196 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
197 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
198 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
199 image_mustnothave_.length());
200 else
201 image_mustnothave_ = null;
202
203
204
205 if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
206
207 image_mustnothave_ = original_image_mustnothave_;
208 return false;
209 }
210 }
211
212 image_mustnothave_ = original_image_mustnothave_;
213
214 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
215 image_mustnothave_ = original_image_mustnothave_;
216 return false;
217 }
218
219 if (verbosity_ > 2) {
220 System.err.println("src url = "+ url_string);
221 }
222
223 image_mustnothave_ = original_image_mustnothave_;
224
225 }
226
227 }
228
229 }
230
231 return true;
232 }
233
234 /** Restricts links to only those that satisfy several specified conditions
235 * regarding the address of the link.
236 *
237 * @param url_string the url to check
238 * @param new_url_string the url from which this link was found
239 * @param depth the number of links followed on this path
240 * @return true if the image is satisfactory, false if not */
241 public boolean filter_href(String url_string, String new_url_string, int depth)
242 {
243 boolean has_href = false;
244 String tmp = href_musthave_;
245 String original_href_musthave_ = href_musthave_;
246
247 // checks that it does contain this content
248 if (href_musthave_ != null) {
249
250 while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
251
252 tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
253 if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
254 href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
255 else
256 href_musthave_ = null;
257
258 if (url_string.indexOf(tmp) >= 0)
259 has_href = true;
260 }
261
262 if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
263 has_href = true;
264
265 href_musthave_ = original_href_musthave_;
266 }
267
268 tmp = image_mustnothave_;
269 String original_image_mustnothave_ = image_mustnothave_;
270
271 // checks that it doesn't contain this content
272 if (image_mustnothave_ != null) {
273
274 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
275
276 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
277 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
278 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
279 else
280 image_mustnothave_ = null;
281
282 if (url_string.indexOf(tmp) >= 0)
283 has_href = false;
284 }
285 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
286 has_href = false;
287
288 image_mustnothave_ = original_image_mustnothave_;
289 }
290
291 // return true if the link is valid and false if not
292 if (href_musthave_==null || has_href)
293 {
294 // might be another URL
295 if (depth < app_.maxDepth())
296 {
297 if (!new_url_string.startsWith(url_string))
298 {
299 return true;
300 }
301 }
302 }
303 return false;
304 }
305
306 /** Adds an image to the stored downloaded images as a triplet.
307 * Ensures that the number of images downloaded but not displayed at
308 * any one time is controlled by using a buffer. If the buffer is
309 * full this function will wait until space becomes available before
310 * continuing. It also restricts the
311 * total number of images to download as specified by the applet.
312 *
313 * @param url the image to download
314 * @param from_url the url that this image was sourced from
315 * @param img_name the name of the image */
316 public void add_image(URL url, String from_url, String img_name)
317 {
318 try {
319
320 boolean had_to_wait = false;
321
322 // ensure that we don't download too many images
323 while (download_images_.size() >= buffer_size_)
324 {
325 had_to_wait = true;
326 Thread.sleep(delay_);
327 }
328
329 // get the image from the url
330 if (verbosity_>=2) {
331 System.err.println(" Downloading image URL: " + url.toString());
332 }
333
334 //ImageIcon image_icon = new ImageIcon(url);
335 // Image image = image_icon.getImage();
336
337 // Image image = Toolkit.getDefaultToolkit().createImage(url);
338 // Image image = app_.getImage(url);
339 Image image = Toolkit.getDefaultToolkit().getImage(url);
340
341 //System.err.println("###DownloadingED image URL: " + url.toString());
342
343 boolean status = app_.prepareImage(image,app_);
344 //System.err.println(" Prepare Image status = " + status);
345
346 Thread.sleep(100);
347
348 // push image onto the downloaded images
349 /* System.err.println("*** Pushing: name="+img_name
350 +" dimensions = "+image_icon.getIconWidth()+"x"+image_icon.getIconHeight());
351 // +" dimensions = "+image.getWidth(app_)+"x"+image.getHeight(app_));
352 */
353
354 download_images_.push(image,from_url, img_name);
355
356 // if have completed the maximum number of downloads for the
357 // application then stop
358 if (download_images_.size() == app_.maxDownloads()) {
359 stop();
360 }
361
362 }
363 catch (Exception e) {
364 thread_running_ = false;
365 stop();
366 e.printStackTrace();
367 }
368 }
369
370 /** Connects to the starting url and looks for all images and links from this
371 * original page. Image links are processed first, so that any images found can be
372 * downloaded immediately and placed on the applet. Secondly, the links to other
373 * pages are recursively processed by this function and treated as a starting url
374 *
375 * @param new_url the url from which to start searching for images and links
376 * @param depth the number of links that have been followed on this path */
377 public void rec_add_images(String new_url, int depth)
378 {
379
380
381 System.err.println("Parsing url = " + new_url);
382
383 if (already_visited(new_url)) return;
384
385 // check if there is a scenario where external hyperlinks are being used
386 externalLinks();
387 String img_name = new String();
388
389 // connect to the url
390 CURL curl = new CURL(new_url);
391 if (curl.connected_ok())
392 {
393 if (verbosity_ >= 1) {
394 System.err.print("Connected OK ... ");
395 }
396
397 // read the page
398 curl.readAll();
399 if (verbosity_ >= 1) {
400 System.err.println("URL read.");
401 }
402
403 // get all the <code><img src=</code> links into a vector
404 Vector src_links = curl.getSrcLinks();
405
406 if (verbosity_ >= 2) {
407 System.err.println(" Got src links... there are " + src_links.size() + " of them.");
408 }
409 // process each of the image links according to the parameters given.
410 for (int i = 0; i < src_links.size(); i++)
411 {
412 URL url = (URL)src_links.elementAt(i);
413 String url_string = url.toString();
414
415 //System.err.println(" source links " + i + " [" + url_string +"]");
416
417 if (verbosity_ >= 3) {
418 System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
419 }
420
421 if (image_file_extension(url_string))
422 {
423 if (filter_image(url_string))
424 {
425 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
426
427 if (verbosity_ >= 2) {
428 System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
429 }
430
431 if ((external_links_ != null) && (!external_links_.isEmpty())) {
432 String ext = (String) external_links_.get(img_name);
433
434
435 if (ext != null){
436 add_image(url, ext, img_name);
437
438 }
439 else{
440
441 add_image(url, new_url, img_name);
442 }
443 }
444 else {
445
446 add_image(url, new_url, img_name);
447 }
448
449
450 }
451
452 }
453
454 }
455
456 // get all the <code><a href=</code> links into a vector
457 Vector href_links = curl.getHrefLinks();
458
459
460 if (verbosity_ >= 2) {
461 System.err.println(" Got href links... there are " + href_links.size() + " of them.");
462 }
463
464
465 // process each of the href links according to the parameters given.
466 for (int i = 0; i < href_links.size(); i++)
467 {
468 URL url = (URL)href_links.elementAt(i);
469 String url_string = url.toString();
470
471 //System.err.println(" href links " + i + "[" + url_string +"]");
472
473
474 if (image_file_extension(url_string))
475 {
476
477 if (filter_image(url_string))
478
479 {
480 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
481
482 if ((external_links_ != null) && (!external_links_.isEmpty())) {
483 String ext = (String) external_links_.get(img_name);
484
485 if (ext != null)
486 add_image(url, ext, img_name);
487 else
488 add_image(url, new_url, img_name);
489 }
490 else {
491 add_image(url, url_string, img_name);
492 }
493 }
494 }
495 else
496 {
497 if (filter_href(url_string,new_url,depth))
498 {
499
500 System.out.println("*************************************");
501 rec_add_images(url_string,depth+1);
502
503 }
504 }
505 }
506 }
507
508 else {
509 System.err.println("Unable able to download "+new_url);
510 }
511 }
512
513
514 /** Used in cases where the image maps to a url outside of it's original location.
515 * When used with Greenstone the collage images will refer to documents in the collections
516 * from which the images are sourced. When used individually, the images may be saved into
517 * a user directory and the pages they reference may be external hyperlinks.
518 * This function reads that external links file and creates a hash map of the image to
519 * its external hyperlink. If the file does not exist the download thread will continue
520 * and assume the first case, that links are internal. */
521 public void externalLinks() {
522 external_links_ = null;
523
524 try {
525
526 if (starting_url_.indexOf(document_root_) >= 0 ){
527 return;
528 }
529
530 // open a url to the file written
531 URL u = new URL(starting_url_ + "externallinks");
532
533 BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
534
535 external_links_ = new Hashtable();
536
537 String l = r.readLine();
538 // split the line of the space, first part is the image, second part the link
539 while (l != null) {
540
541 String tmp1 = new String();
542 String tmp2 = new String();
543
544 if (l.indexOf(" ") >= 0) {
545
546 tmp1 = l.substring(0, l.indexOf(" "));
547 if (l.length() > l.indexOf(" ") + 1)
548 tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
549 else
550 tmp2 = null;
551
552 if (tmp2 != null) {
553 external_links_.put(tmp1, tmp2);
554 //System.err.println(tmp1 + " " + tmp2);
555 }
556 }
557 l = r.readLine();
558 }
559
560 r.close();
561
562 } catch (Exception e) {
563 e.printStackTrace();
564 return;
565 }
566 }
567
568 /** Controls the download thread */
569 public void run ()
570 {
571 System.err.println("Starting download thread.");
572 visited_url_ = new Hashtable();
573
574 rec_add_images(starting_url_,1);
575
576 System.err.println("Download thread finished.");
577 }
578}
Note: See TracBrowser for help on using the repository browser.