source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 11563

Last change on this file since 11563 was 11563, checked in by shaoqun, 18 years ago

Changed the thread-handling code to make it thread-safe

  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1package org.nzdl.gsdl.GsdlCollageApplet;
2
3import java.awt.*;
4import java.io.*;
5import java.net.*;
6import java.util.*;
7
8import javax.swing.ImageIcon; //****
9
10
11/**
12 * @author Katrina Edgar
13 * @author David Bainbridge
14 *
15 * Controls retrieval of images from the specified starting url. Follows appropriate
16 * links from this starting point, traversing in a tree-like state through several other
17 * pages. Filters images and links based on specified parameters. Also controls the quantity
18 * of downloading that occurs by restricting the number of downloaded images that are yet to
19 * be displayed to 10, and the total number of downloads allowed is also restricted by
20 * the applet application (to prevent downloading occuring infinitely). */
21
22public class DownloadUrls extends Thread {
23
24 /** Refers to applet */
25 GsdlCollageApplet app_ = null;
26 /** Refers to download thread */
27 DownloadImages download_images_ = null;
28
29 /** The address from which the application should start looking for images */
30 String starting_url_ = null;
31
32 /** the root directory of Greenstone*/
33 String document_root_ = null;
34
35
36 /** CHRIS - Holds the contents of the collection's assoc directory */
37 // File[] assocDir_ = null;
38
39 /** Restricts links followed from the starting url to links that contain this string */
40 String href_musthave_ = null;
41 /** Restricts links followed from the starting url to links that do not contain this string.
42 * Also prevents image names from containing this string */
43 String image_mustnothave_ = null;
44 /** Ignore images whose names begin with this string */
45 String image_ignore_ = null;
46 /** Restricts the types of images included in the collage, for example jpg, gif, etc. */
47 String image_type_ = null;
48
49 /** A static delay used when attempting to download more images into a full downloading buffer */
50 final int delay_ = 3000;
51 /** The maximum number of images to have downloaded and not yet displayed */
52 final int buffer_size_ = 1;
53
54 /** Used in cases where the image maps to a url outside of it's original location.
55 * When used with Greenstone the collage images will refer to documents in the collections
56 * from which the images are sourced. When used individually, the images may be saved into
57 * a user directory and the pages they reference may be external hyperlinks. */
58 Hashtable external_links_ = null;
59
60 /** Records all urls which have already been examined */
61 Hashtable visited_url_ = null;
62 /** Determines whether there are still pages to examine and images to download */
63 boolean thread_running_ = true;
64
65 int verbosity_ = 0;
66
67 protected boolean busy_ = false;
68
69 MediaTracker tracker;
70
71 /** Constructor to initialise a download thread from which images are found,
72 * saves parameters into local variables for use within the class.
73 *
74 * @param app reference to the applet
75 * @param download_images class which stores the images retrieved in triplets
76 * @param starting_url the url from which the search for images should begin
77 * @param href_musthave restricts links to only those containing this string
78 * @param image_mustnothave restricts links and image names to only those that don't contain this string
79 * @param image_ignore restricts the beginning of image names
80 * @param image_type restricts the type of images included in the collage to those named */
81 public DownloadUrls(GsdlCollageApplet app,
82 DownloadImages download_images, String starting_url,
83 String href_musthave, String image_mustnothave,
84 String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
85 {
86 super("DownloadUrls");
87 app_ = app;
88 download_images_ = download_images;
89
90 starting_url_ = starting_url;
91 href_musthave_ = href_musthave;
92 image_mustnothave_ = image_mustnothave;
93 image_ignore_ = image_ignore;
94 image_type_ = image_type;
95 document_root_ = document_root;
96 verbosity_ = verbosity;
97 tracker = trk;
98
99 System.err.println("starting_url_ " + starting_url +"\n"+
100 "href_musthave_ " + href_musthave +"\n"+
101 "image_mustnothave_" + image_mustnothave+"\n"+
102 "image_ignore_ "+ image_ignore+"\n"+
103 "image_type_ "+ image_type+"\n"+
104 "document root "+ document_root_
105 );
106
107
108
109
110 }
111
112 public boolean getStatus(){
113 return busy_;
114
115 }
116
117 /** Determines whether or not a url has already been examined
118 *
119 * @param url_string the url to check
120 * @return true if the url has been visited, false if not */
121 public boolean already_visited(String url_string)
122 {
123 int hash_pos = url_string.indexOf("#");
124 if (hash_pos>0)
125 {
126 // strip off #anchor reference
127 url_string = url_string.substring(0,hash_pos);
128 }
129
130 // if the url has been visited before, return true
131 if (visited_url_.containsKey(url_string))
132 {
133 if (verbosity_ > 3)
134 {
135 System.err.println("Visited " + url_string + " before!");
136 }
137 return true;
138 }
139
140 visited_url_.put(url_string,"visited");
141
142 return false;
143 }
144
145 /** Restricts the type of images that can be included in the collage
146 *
147 * @param url_string the url to check
148 * @return true if the image is of a specified type, false if not */
149 public boolean image_file_extension(String url_string)
150 {
151 // lower case comparisons
152 String url_lstring = url_string.toLowerCase();
153
154 if (image_type_ == null)
155 return true;
156
157 String tmp = image_type_;
158 String original_image_type_ = image_type_;
159
160 while (image_type_ != null && image_type_.indexOf("%") >= 0) {
161
162 tmp = image_type_.substring(0, image_type_.indexOf("%"));
163
164 if (image_type_.length() > image_type_.indexOf("%") + 1)
165 image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
166 else
167 image_type_ = null;
168
169 if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
170 image_type_ = original_image_type_;
171 return true;
172 }
173 }
174
175 if (image_type_ != null && url_lstring.endsWith(image_type_)) {
176 image_type_ = original_image_type_;
177 return true;
178 }
179
180 image_type_ = original_image_type_;
181 return false;
182 }
183
184 /** Restricts images to only those that satisfy several specified conditions
185 * regarding the content of the image name and url.
186 *
187 * @param url_string the url to check
188 * @return true if the image is satisfactory, false if not */
189 public boolean filter_image(String url_string)
190 {
191
192 if (image_ignore_==null || !url_string.startsWith(image_ignore_))
193 {
194 if (!already_visited(url_string))
195 {
196 if (image_mustnothave_ != null) {
197
198 String tmp = image_mustnothave_;
199 String original_image_mustnothave_ = image_mustnothave_;
200
201 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
202
203 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
204 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
205 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
206 image_mustnothave_.length());
207 else
208 image_mustnothave_ = null;
209
210
211
212 if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
213
214 image_mustnothave_ = original_image_mustnothave_;
215 return false;
216 }
217 }
218
219 image_mustnothave_ = original_image_mustnothave_;
220
221 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
222 image_mustnothave_ = original_image_mustnothave_;
223 return false;
224 }
225
226 if (verbosity_ > 2) {
227 System.err.println("src url = "+ url_string);
228 }
229
230 image_mustnothave_ = original_image_mustnothave_;
231
232 }
233
234 }
235
236 }
237
238 return true;
239 }
240
241 /** Restricts links to only those that satisfy several specified conditions
242 * regarding the address of the link.
243 *
244 * @param url_string the url to check
245 * @param new_url_string the url from which this link was found
246 * @param depth the number of links followed on this path
247 * @return true if the image is satisfactory, false if not */
248 public boolean filter_href(String url_string, String new_url_string, int depth)
249 {
250 boolean has_href = false;
251 String tmp = href_musthave_;
252 String original_href_musthave_ = href_musthave_;
253
254 // checks that it does contain this content
255 if (href_musthave_ != null) {
256
257 while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
258
259 tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
260 if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
261 href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
262 else
263 href_musthave_ = null;
264
265 if (url_string.indexOf(tmp) >= 0)
266 has_href = true;
267 }
268
269 if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
270 has_href = true;
271
272 href_musthave_ = original_href_musthave_;
273 }
274
275 tmp = image_mustnothave_;
276 String original_image_mustnothave_ = image_mustnothave_;
277
278 // checks that it doesn't contain this content
279 if (image_mustnothave_ != null) {
280
281 while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
282
283 tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
284 if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
285 image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
286 else
287 image_mustnothave_ = null;
288
289 if (url_string.indexOf(tmp) >= 0)
290 has_href = false;
291 }
292 if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
293 has_href = false;
294
295 image_mustnothave_ = original_image_mustnothave_;
296 }
297
298 // return true if the link is valid and false if not
299 if (href_musthave_==null || has_href)
300 {
301 // might be another URL
302 if (depth < app_.maxDepth())
303 {
304 if (!new_url_string.startsWith(url_string))
305 {
306 return true;
307 }
308 }
309 }
310 return false;
311 }
312
313 /** Adds an image to the stored downloaded images as a triplet.
314 * Ensures that the number of images downloaded but not displayed at
315 * any one time is controlled by using a buffer. If the buffer is
316 * full this function will wait until space becomes available before
317 * continuing. It also restricts the
318 * total number of images to download as specified by the applet.
319 *
320 * @param url the image to download
321 * @param from_url the url that this image was sourced from
322 * @param img_name the name of the image */
323 public void add_image(URL url, String from_url, String img_name)
324 {
325 // get the image from the url
326 if (verbosity_>=2) {
327 //System.err.println(" ****Downloading image URL: " + url.toString());
328 }
329
330 int size = download_images_.downloadImage(tracker,url, from_url, img_name);
331
332 try{
333 // if have completed the maximum number of downloads for the
334 // application then stop
335 if (size == app_.maxDownloads()) {
336 stop();
337 }
338
339 }
340 catch (Exception e) {
341 thread_running_ = false;
342 stop();
343 e.printStackTrace();
344 }
345 }
346
347 /** Connects to the starting url and looks for all images and links from this
348 * original page. Image links are processed first, so that any images found can be
349 * downloaded immediately and placed on the applet. Secondly, the links to other
350 * pages are recursively processed by this function and treated as a starting url
351 *
352 * @param new_url the url from which to start searching for images and links
353 * @param depth the number of links that have been followed on this path */
354 public void rec_add_images(String new_url, int depth)
355 {
356
357 if (already_visited(new_url)) return;
358
359 // check if there is a scenario where external hyperlinks are being used
360 externalLinks();
361 String img_name = new String();
362
363 // connect to the url
364 CURL curl = new CURL(new_url);
365 if (curl.connected_ok())
366 {
367 if (verbosity_ >= 1) {
368 System.err.print("Connected OK ... ");
369 }
370
371 // read the page
372 curl.readAll();
373 if (verbosity_ >= 1) {
374 System.err.println("URL read.");
375 }
376
377 // get all the <code><img src=</code> links into a vector
378 Vector src_links = curl.getSrcLinks();
379
380 if (verbosity_ >= 2) {
381 System.err.println(" Got src links... there are " + src_links.size() + " of them.");
382 }
383 // process each of the image links according to the parameters given.
384 for (int i = 0; i < src_links.size(); i++)
385 {
386 URL url = (URL)src_links.get(i);
387 String url_string = url.toString();
388
389 //System.err.println(" source links " + i + " [" + url_string +"]");
390
391 if (verbosity_ >= 4) {
392 System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
393 }
394
395 if (image_file_extension(url_string))
396 {
397 if (filter_image(url_string))
398 {
399 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
400
401 if (verbosity_ >= 2) {
402 System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
403 }
404
405 if ((external_links_ != null) && (!external_links_.isEmpty())) {
406 String ext = (String) external_links_.get(img_name);
407
408
409 if (ext != null){
410 add_image(url, ext, img_name);
411
412 }
413 else{
414
415 add_image(url, new_url, img_name);
416 }
417 }
418 else {
419
420 add_image(url, new_url, img_name);
421 }
422
423
424 }
425
426 }
427
428 }
429
430 // get all the <code><a href=</code> links into a vector
431 Vector href_links = curl.getHrefLinks();
432
433
434 if (verbosity_ >= 2) {
435 System.err.println(" Got href links... there are " + href_links.size() + " of them.");
436 }
437
438
439 // process each of the href links according to the parameters given.
440 for (int i = 0; i < href_links.size(); i++)
441 {
442 URL url = (URL)href_links.get(i);
443 String url_string = url.toString();
444
445 //System.err.println(" href links " + i + "[" + url_string +"]");
446
447
448 if (image_file_extension(url_string))
449 {
450
451 if (filter_image(url_string))
452
453 {
454 img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
455
456 if ((external_links_ != null) && (!external_links_.isEmpty())) {
457 String ext = (String) external_links_.get(img_name);
458
459 if (ext != null)
460 add_image(url, ext, img_name);
461 else
462 add_image(url, new_url, img_name);
463 }
464 else {
465 add_image(url, url_string, img_name);
466 }
467 }
468 }
469 else
470 {
471 if (filter_href(url_string,new_url,depth))
472 {
473 rec_add_images(url_string,depth+1);
474
475 }
476 }
477 }
478 }
479
480 else {
481 System.err.println("Unable able to download "+new_url);
482 }
483 }
484
485
486 /** Used in cases where the image maps to a url outside of it's original location.
487 * When used with Greenstone the collage images will refer to documents in the collections
488 * from which the images are sourced. When used individually, the images may be saved into
489 * a user directory and the pages they reference may be external hyperlinks.
490 * This function reads that external links file and creates a hash map of the image to
491 * its external hyperlink. If the file does not exist the download thread will continue
492 * and assume the first case, that links are internal. */
493 public void externalLinks() {
494 external_links_ = null;
495
496 try {
497
498 if (starting_url_.indexOf(document_root_) >= 0 ){
499 return;
500 }
501
502 // open a url to the file written
503 URL u = new URL(starting_url_ + "externallinks");
504
505 BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
506
507 external_links_ = new Hashtable();
508
509 String l = r.readLine();
510 // split the line of the space, first part is the image, second part the link
511 while (l != null) {
512
513 String tmp1 = new String();
514 String tmp2 = new String();
515
516 if (l.indexOf(" ") >= 0) {
517
518 tmp1 = l.substring(0, l.indexOf(" "));
519 if (l.length() > l.indexOf(" ") + 1)
520 tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
521 else
522 tmp2 = null;
523
524 if (tmp2 != null) {
525 external_links_.put(tmp1, tmp2);
526 //System.err.println(tmp1 + " " + tmp2);
527 }
528 }
529 l = r.readLine();
530 }
531
532 r.close();
533
534 } catch (Exception e) {
535 e.printStackTrace();
536 return;
537 }
538 }
539
540 /** Controls the download thread */
541 public void run ()
542 {
543 System.err.println("Starting download thread.");
544 visited_url_ = new Hashtable();
545
546 rec_add_images(starting_url_,1);
547 download_images_.stopDownload();
548 System.err.println("Download thread finished.");
549 }
550}
Note: See TracBrowser for help on using the repository browser.