source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@6816

Last change on this file since 6816 was 6816, checked in by mdewsnip, 20 years ago

The GsdlCollageApplet: a classifier that displays a collage of the images in a collection. By Katrina Edgar (kde2).

  • Property svn:keywords set to Author Date Id Revision
File size: 14.9 KB
package org.nzdl.gsdl.GsdlCollageApplet;

import java.awt.*;
import java.io.*;
import java.net.*;
import java.util.*;

/**
 * @author Katrina Edgar
 * @author David Bainbridge
 *
 * Controls retrieval of images from the specified starting url. Follows appropriate
 * links from this starting point, traversing in a tree-like manner through several other
 * pages. Filters images and links based on specified parameters. Also controls the quantity
 * of downloading that occurs by restricting the number of downloaded but not yet displayed
 * images to 10; the total number of downloads is also restricted by the applet
 * (to prevent downloading from continuing indefinitely). */
public class DownloadUrls extends Thread {

    /** Refers to applet */
    GsdlCollageApplet app_ = null;
    /** Refers to download thread */
    DownloadImages download_images_ = null;

    /** The address from which the application should start looking for images */
    String starting_url_ = null;
    /** Restricts links followed from the starting url to links that contain this string */
    String href_musthave_ = null;
    /** Restricts links followed from the starting url to links that do not contain this string.
     * Also prevents image names from containing this string */
    String image_mustnothave_ = null;
    /** Ignore images whose names begin with this string */
    String image_ignore_ = null;
    /** Restricts the types of images included in the collage, for example jpg, gif, etc. */
    String image_type_ = null;

    /** A static delay used when attempting to download more images into a full downloading buffer */
    final int delay_ = 1000;
    /** The maximum number of images to have downloaded and not yet displayed */
    final int buffer_size_ = 10;

    /** Used in cases where the image maps to a url outside of its original location.
     * When used with Greenstone the collage images will refer to documents in the collections
     * from which the images are sourced. When used individually, the images may be saved into
     * a user directory and the pages they reference may be external hyperlinks. */
    Hashtable external_links_ = null;

    /** Records all urls which have already been examined */
    Hashtable visited_url_ = null;
    /** Determines whether there are still pages to examine and images to download */
    boolean thread_running_ = true;

    /** Constructor to initialise a download thread from which images are found.
     * Saves parameters into local variables for use within the class.
     *
     * @param app reference to the applet
     * @param download_images class which stores the retrieved images in triplets
     * @param starting_url the url from which the search for images should begin
     * @param href_musthave restricts links to only those containing this string
     * @param image_mustnothave restricts links and image names to only those that don't contain this string
     * @param image_ignore restricts the beginning of image names
     * @param image_type restricts the type of images included in the collage to those named */
    public DownloadUrls(GsdlCollageApplet app,
                        DownloadImages download_images, String starting_url,
                        String href_musthave, String image_mustnothave,
                        String image_ignore, String image_type)
    {
        super("DownloadUrls");
        app_ = app;
        download_images_ = download_images;

        starting_url_ = starting_url;
        href_musthave_ = href_musthave;
        image_mustnothave_ = image_mustnothave;
        image_ignore_ = image_ignore;
        image_type_ = image_type;
    }
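
    // Illustrative usage only (not part of the original class): a sketch of how the applet
    // might construct and start this download thread. The parameter values below are
    // hypothetical examples of the "%"-separated filter strings this class expects.
    //
    //   DownloadUrls downloader =
    //       new DownloadUrls(this, download_images,
    //                        "http://servername/images/",  // starting_url
    //                        "browse%sample",              // href_musthave
    //                        "banner%button",              // image_mustnothave
    //                        "icon",                       // image_ignore
    //                        "jpg%gif%png");               // image_type
    //   downloader.start();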

    /** Determines whether or not a url has already been examined
     *
     * @param url_string the url to check
     * @return true if the url has been visited, false if not */
    public boolean already_visited(String url_string)
    {
        int hash_pos = url_string.indexOf("#");
        if (hash_pos > 0)
        {
            // strip off #anchor reference
            url_string = url_string.substring(0, hash_pos);
        }

        // if the url has been visited before, return true
        if (visited_url_.containsKey(url_string))
        {
            if (app_.verbosity() > 3)
            {
                System.err.println("Visited " + url_string + " Before!");
            }
            return true;
        }

        visited_url_.put(url_string, "visited");

        return false;
    }
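
    // For example (illustrative value): already_visited("http://host/page.html#top") strips the
    // "#anchor" suffix and records "http://host/page.html" in visited_url_, so a later call with
    // either form of the url returns true and the page is not processed twice.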

    /** Restricts the type of images that can be included in the collage
     *
     * @param url_string the url to check
     * @return true if the image is of a specified type, false if not */
    public boolean image_file_extension(String url_string)
    {
        // lower case comparisons
        String url_lstring = url_string.toLowerCase();

        if (image_type_ == null)
            return true;

        String tmp = image_type_;
        String original_image_type_ = image_type_;

        while (image_type_ != null && image_type_.indexOf("%") >= 0) {

            tmp = image_type_.substring(0, image_type_.indexOf("%"));
            if (image_type_.length() > image_type_.indexOf("%") + 1)
                image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
            else
                image_type_ = null;

            if (url_lstring.endsWith(tmp)) {
                image_type_ = original_image_type_;
                return true;
            }
        }

        if (image_type_ != null && url_lstring.endsWith(image_type_)) {
            image_type_ = original_image_type_;
            return true;
        }

        image_type_ = original_image_type_;
        return false;
    }
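
    // For example (illustrative value): with image_type_ set to "jpg%gif%png", the list is split
    // on "%" and "photo1.JPG" is accepted because the comparison uses the lower-cased url, while
    // "diagram.bmp" matches none of the extensions and is rejected. image_type_ is restored to
    // its original value before the method returns.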

    /** Restricts images to only those that satisfy several specified conditions
     * regarding the content of the image name and url.
     *
     * @param url_string the url to check
     * @return true if the image is satisfactory, false if not */
    public boolean filter_image(String url_string)
    {
        if (image_ignore_ == null || !url_string.startsWith(image_ignore_))
        {
            if (!already_visited(url_string))
            {
                if (image_mustnothave_ != null) {

                    String tmp = image_mustnothave_;
                    String original_image_mustnothave_ = image_mustnothave_;

                    while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {

                        tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
                        if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
                            image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
                                                                              image_mustnothave_.length());
                        else
                            image_mustnothave_ = null;

                        if (url_string.indexOf(tmp) >= 0) {
                            image_mustnothave_ = original_image_mustnothave_;
                            return false;
                        }
                    }

                    if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
                        image_mustnothave_ = original_image_mustnothave_;
                        return false;
                    }

                    if (app_.verbosity() > 2) {
                        System.err.println("src url = " + url_string);
                    }

                    image_mustnothave_ = original_image_mustnothave_;
                    return true;
                }
            }
        }

        return false;
    }
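
    // For example (illustrative value): with image_mustnothave_ set to "spacer%banner", an image
    // url containing either "spacer" or "banner" is rejected; otherwise the image passes, provided
    // it does not start with image_ignore_ and has not been seen before. Note that this check also
    // marks the url as visited via already_visited().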

    /** Restricts links to only those that satisfy several specified conditions
     * regarding the address of the link.
     *
     * @param url_string the url to check
     * @param new_url_string the url from which this link was found
     * @param depth the number of links followed on this path
     * @return true if the link is satisfactory, false if not */
    public boolean filter_href(String url_string, String new_url_string, int depth)
    {
        boolean has_href = false;
        String tmp = href_musthave_;
        String original_href_musthave_ = href_musthave_;

        // checks that it does contain this content
        if (href_musthave_ != null) {

            while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {

                tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
                if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
                    href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
                else
                    href_musthave_ = null;

                if (url_string.indexOf(tmp) >= 0)
                    has_href = true;
            }

            if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
                has_href = true;

            href_musthave_ = original_href_musthave_;
        }

        tmp = image_mustnothave_;
        String original_image_mustnothave_ = image_mustnothave_;

        // checks that it doesn't contain this content
        if (image_mustnothave_ != null) {

            while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {

                tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
                if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
                    image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
                else
                    image_mustnothave_ = null;

                if (url_string.indexOf(tmp) >= 0)
                    has_href = false;
            }
            if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
                has_href = false;

            image_mustnothave_ = original_image_mustnothave_;
        }

        // return true if the link is valid and false if not
        if (href_musthave_ == null || has_href)
        {
            // might be another URL
            if (depth < app_.maxDepth())
            {
                if (!new_url_string.startsWith(url_string))
                {
                    return true;
                }
            }
        }
        return false;
    }
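
    // For example (illustrative values): with href_musthave_ set to "browse%classifier", a link is
    // followed only if its url contains "browse" or "classifier", does not contain any of the
    // image_mustnothave_ terms, and the current depth is still below app_.maxDepth().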

    /** Adds an image to the stored downloaded images as a triplet.
     * Ensures that the number of images downloaded but not yet displayed at
     * any one time is controlled by using a buffer. If the buffer is
     * full this function will wait until space becomes available before
     * continuing. It also restricts the
     * total number of images to download as specified by the applet.
     *
     * @param url the image to download
     * @param from_url the url that this image was sourced from
     * @param img_name the name of the image */
    public void add_image(URL url, String from_url, String img_name)
    {
        try {

            boolean had_to_wait = false;

            // ensure that we don't download too many images
            while (download_images_.size() >= buffer_size_)
            {
                had_to_wait = true;
                Thread.sleep(delay_);
            }

            // get the image from the url
            Image image = Toolkit.getDefaultToolkit().getImage(url);

            // push image onto the downloaded images
            download_images_.push(image, from_url, img_name);

            // if have completed the maximum number of downloads for the
            // application then stop
            if (download_images_.size() == app_.maxDownloads())
            {
                stop();
            }

        }
        catch (Exception e) {
            thread_running_ = false;
            stop();
            e.printStackTrace();
        }
    }
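
    // The buffer check above is a simple producer/consumer arrangement: this download thread
    // sleeps in delay_ (1000 ms) increments while download_images_ already holds buffer_size_ (10)
    // images that the display side has not yet consumed, then pushes the next
    // (image, source url, name) triplet.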

    /** Connects to the starting url and looks for all images and links from this
     * original page. Image links are processed first, so that any images found can be
     * downloaded immediately and placed on the applet. Secondly, the links to other
     * pages are recursively processed by this function and treated as a starting url.
     *
     * @param new_url the url from which to start searching for images and links
     * @param depth the number of links that have been followed on this path */
    public void rec_add_images(String new_url, int depth)
    {
        System.err.println("Parsing url = " + new_url);

        if (already_visited(new_url)) return;

        // check if there is a scenario where external hyperlinks are being used
        externalLinks();
        String img_name = new String();

        // connect to the url
        CURL curl = new CURL(new_url);
        if (curl.connected_ok())
        {
            // read the page
            curl.readAll();

            // get all the <code><img src=</code> links into a vector
            Vector src_links = curl.getSrcLinks();

            // process each of the image links according to the parameters given.
            for (int i = 0; i < src_links.size(); i++)
            {
                URL url = (URL) src_links.elementAt(i);
                String url_string = url.toString();

                if (image_file_extension(url_string))
                {
                    if (filter_image(url_string))
                    {
                        img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());

                        if (external_links_ != null && !external_links_.isEmpty()) {
                            String ext = (String) external_links_.get(img_name);

                            if (ext != null)
                                add_image(url, ext, img_name);
                            else
                                add_image(url, new_url, img_name);
                        }
                        else {
                            add_image(url, new_url, img_name);
                        }
                    }
                }

            }

            // get all the <code><a href=</code> links into a vector
            Vector href_links = curl.getHrefLinks();

            // process each of the href links according to the parameters given.
            for (int i = 0; i < href_links.size(); i++)
            {
                URL url = (URL) href_links.elementAt(i);
                String url_string = url.toString();

                if (image_file_extension(url_string))
                {
                    if (filter_image(url_string))
                    {
                        img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());

                        if (external_links_ != null && !external_links_.isEmpty()) {
                            String ext = (String) external_links_.get(img_name);

                            if (ext != null)
                                add_image(url, ext, img_name);
                            else
                                add_image(url, new_url, img_name);
                        }
                        else {
                            add_image(url, url_string, img_name);
                        }
                    }
                }
                else
                {
                    if (filter_href(url_string, new_url, depth))
                    {
                        rec_add_images(url_string, depth + 1);
                    }
                }
            }
        }

        else {
            System.err.println("Unable to download " + new_url);
        }
    }
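
    // For example (illustrative trace): starting from a page at depth 1, every <img src=...> url
    // that passes the extension and image filters is downloaded via add_image(), and every
    // <a href=...> url that passes filter_href() is parsed the same way at depth 2, and so on
    // until app_.maxDepth() is reached or no new links remain.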

    /** Used in cases where the image maps to a url outside of its original location.
     * When used with Greenstone the collage images will refer to documents in the collections
     * from which the images are sourced. When used individually, the images may be saved into
     * a user directory and the pages they reference may be external hyperlinks.
     * This function reads that external links file and creates a hash map of the image to
     * its external hyperlink. If the file does not exist the download thread will continue
     * and assume the first case, that links are internal. */
    public void externalLinks() {

        try {

            if (starting_url_.indexOf("gsdl") >= 0) {
                external_links_ = null;
                return;
            }

            // open a url to the file written
            URL u = new URL(starting_url_ + "externallinks");

            BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));

            external_links_ = new Hashtable();

            String l = r.readLine();
            // split the line on the space: the first part is the image, the second part the link
            while (l != null) {

                String tmp1 = new String();
                String tmp2 = new String();

                if (l.indexOf(" ") >= 0) {

                    tmp1 = l.substring(0, l.indexOf(" "));
                    if (l.length() > l.indexOf(" ") + 1)
                        tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
                    else
                        tmp2 = null;

                    if (tmp2 != null) {
                        external_links_.put(tmp1, tmp2);
                        //System.err.println(tmp1 + " " + tmp2);
                    }
                }
                l = r.readLine();
            }

            r.close();

        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
    }
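
    // Illustrative only: the "externallinks" file read above is expected to contain one
    // space-separated mapping per line, for example
    //
    //   flower.jpg http://www.example.org/gallery/flower.html
    //
    // which would make the collage link flower.jpg to that external page instead of the page the
    // image was found on. The example entry here is hypothetical.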

    /** Controls the download thread */
    public void run()
    {
        System.err.println("Starting download thread.");
        visited_url_ = new Hashtable();
        rec_add_images(starting_url_, 1);

        System.err.println("Download thread finished.");
    }
}