Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 6816

Last change on this file since 6816 was 6816, checked in by mdewsnip, 20 years ago
The GsdlCollageApplet: a classifier that displays a collage of the images in a collection. By Katrina Edgar (kde2).
Property svn:keywords set to `Author Date Id Revision`
File size: 14.9 KB

Line
1	package org.nzdl.gsdl.GsdlCollageApplet;
2
3	import java.awt.*;
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7
8	/**
9	* @author Katrina Edgar
10	* @author David Bainbridge
11	*
12	* Controls retrieval of images from the specified starting url. Follows appropriate
13	* links from this starting point, traversing in a tree-like state through several other
14	* pages. Filters images and links based on specified parameters. Also controls the quantity
15	* of downloading that occurs by restricting the number of downloaded images that are yet to
16	* be displayed to 10, and the total number of downloads allowed is also restricted by
17	* the applet application (to prevent downloading from continuing indefinitely). */
18	public class DownloadUrls extends Thread {
19
20	/** Refers to applet */
21	GsdlCollageApplet app_ = null;
22	/** Refers to download thread */
23	DownloadImages download_images_ = null;
24
25	/** The address from which the application should start looking for images */
26	String starting_url_ = null;
27	/** Restricts links followed from the starting url to links that contain this string */
28	String href_musthave_ = null;
29	/** Restricts links followed from the starting url to links that do not contain this string.
30	* Also prevents image names from containing this string */
31	String image_mustnothave_ = null;
32	/** Ignore images whose names begin with this string */
33	String image_ignore_ = null;
34	/** Restricts the types of images included in the collage, for example jpg, gif, etc. */
35	String image_type_ = null;
36
37	/** A static delay used when attempting to download more images into a full downloading buffer */
38	final int delay_ = 1000;
39	/** The maximum number of images to have downloaded and not yet displayed */
40	final int buffer_size_ = 10;
41
42	/** Used in cases where the image maps to a url outside of it's original location.
43	* When used with Greenstone the collage images will refer to documents in the collections
44	* from which the images are sourced. When used individually, the images may be saved into
45	* a user directory and the pages they reference may be external hyperlinks. */
46	Hashtable external_links_ = null;
47
48	/** Records all urls which have already been examined */
49	Hashtable visited_url_ = null;
50	/** Determines whether there are still pages to examine and images to download */
51	boolean thread_running_ = true;
52
53	/** Constructor to initialise a download thread from which images are found,
54	* saves parameters into local variables for use within the class.
55	*
56	* @param app reference to the applet
57	* @param download_images class which stores the images retrieved in triplets
58	* @param starting_url the url from which the search for images should begin
59	* @param href_musthave restricts links to only those containing this string
60	* @param image_mustnothave restricts links and image names to only those that don't contain this string
61	* @param image_ignore restricts the beginning of image names
62	* @param image_type restricts the type of images included in the collage to those named */
63	public DownloadUrls(GsdlCollageApplet app,
64	DownloadImages download_images, String starting_url,
65	String href_musthave, String image_mustnothave,
66	String image_ignore, String image_type)
67	{
68	super("DownloadUrls");
69	app_ = app;
70	download_images_ = download_images;
71
72	starting_url_ = starting_url;
73	href_musthave_ = href_musthave;
74	image_mustnothave_ = image_mustnothave;
75	image_ignore_ = image_ignore;
76	image_type_ = image_type;
77	}
78
79	/** Determines whether or not a url has already been examined
80	*
81	* @param url_string the url to check
82	* @return true if the url has been visited, false if not */
83	public boolean already_visited(String url_string)
84	{
85	int hash_pos = url_string.indexOf("#");
86	if (hash_pos>0)
87	{
88	// strip off #anchor reference
89	url_string = url_string.substring(0,hash_pos);
90	}
91
92	// if the url has been visited before, return true
93	if (visited_url_.containsKey(url_string))
94	{
95	if (app_.verbosity() > 3)
96	{
97	System.err.println("Visited " + url_string + " Before!");
98	}
99	return true;
100	}
101
102	visited_url_.put(url_string,"visited");
103
104	return false;
105	}
106
107	/** Restricts the type of images that can be included in the collage
108	*
109	* @param url_string the url to check
110	* @return true if the image is of a specified type, false if not */
111	public boolean image_file_extension(String url_string)
112	{
113	// lower case comparisons
114	String url_lstring = url_string.toLowerCase();
115
116	if (image_type_ == null)
117	return true;
118
119	String tmp = image_type_;
120	String original_image_type_ = image_type_;
121
122	while (image_type_ != null && image_type_.indexOf("%") >= 0) {
123
124	tmp = image_type_.substring(0, image_type_.indexOf("%"));
125	if (image_type_.length() > image_type_.indexOf("%") + 1)
126	image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
127	else
128	image_type_ = null;
129
130	if (url_lstring.endsWith(tmp)) {
131	image_type_ = original_image_type_;
132	return true;
133	}
134	}
135
136	if (image_type_ != null && url_lstring.endsWith(image_type_)) {
137	image_type_ = original_image_type_;
138	return true;
139	}
140
141	image_type_ = original_image_type_;
142	return false;
143	}
144
145	/** Restricts images to only those that satisfy several specified conditions
146	* regarding the content of the image name and url.
147	*
148	* @param url_string the url to check
149	* @return true if the image is satisfactory, false if not */
150	public boolean filter_image(String url_string)
151	{
152	if (image_ignore_==null \|\| !url_string.startsWith(image_ignore_))
153	{
154	if (!already_visited(url_string))
155	{
156	if (image_mustnothave_ != null) {
157
158	String tmp = image_mustnothave_;
159	String original_image_mustnothave_ = image_mustnothave_;
160
161	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
162
163	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
164	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
165	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
166	image_mustnothave_.length());
167	else
168	image_mustnothave_ = null;
169
170	if (url_string.indexOf(tmp) >= 0) {
171	image_mustnothave_ = original_image_mustnothave_;
172	return false;
173	}
174	}
175
176	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
177	image_mustnothave_ = original_image_mustnothave_;
178	return false;
179	}
180
181	if (app_.verbosity() > 2) {
182	System.err.println("src url = "+ url_string);
183	}
184
185	image_mustnothave_ = original_image_mustnothave_;
186	return true;
187	}
188	}
189	}
190
191	return false;
192	}
193
194	/** Restricts links to only those that satisfy several specified conditions
195	* regarding the address of the link.
196	*
197	* @param url_string the url to check
198	* @param new_url_string the url from which this link was found
199	* @param depth the number of links followed on this path
200	* @return true if the image is satisfactory, false if not */
201	public boolean filter_href(String url_string, String new_url_string, int depth)
202	{
203	boolean has_href = false;
204	String tmp = href_musthave_;
205	String original_href_musthave_ = href_musthave_;
206
207	// checks the it does contain this content
208	if (href_musthave_ != null) {
209
210	while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
211
212	tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
213	if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
214	href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
215	else
216	href_musthave_ = null;
217
218	if (url_string.indexOf(tmp) >= 0)
219	has_href = true;
220	}
221
222	if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
223	has_href = true;
224
225	href_musthave_ = original_href_musthave_;
226	}
227
228	tmp = image_mustnothave_;
229	String original_image_mustnothave_ = image_mustnothave_;
230
231	// checks that it doesn't contain this content
232	if (image_mustnothave_ != null) {
233
234	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
235
236	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
237	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
238	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
239	else
240	image_mustnothave_ = null;
241
242	if (url_string.indexOf(tmp) >= 0)
243	has_href = false;
244	}
245	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
246	has_href = false;
247
248	image_mustnothave_ = original_image_mustnothave_;
249	}
250
251	// return true if the link is valid and false if not
252	if (href_musthave_==null \|\| has_href)
253	{
254	// might be another URL
255	if (depth < app_.maxDepth())
256	{
257	if (!new_url_string.startsWith(url_string))
258	{
259	return true;
260	}
261	}
262	}
263	return false;
264	}
265
266	/** Adds an image to the stored downloaded images as a triplet.
267	* Ensures that the number of images downloaded but not displayed at
268	* anyone time is controlled by using a buffer. If the buffer is
269	* full this function will wait until space becomes available before
270	* continuing. It also restricts the
271	* total number of images to download as specified by the applet.
272	*
273	* @param url the image to download
274	* @param from_url the url that this image was sourced from
275	* @param img_name the name of the image */
276	public void add_image(URL url, String from_url, String img_name)
277	{
278	try {
279
280	boolean had_to_wait = false;
281
282	// ensure that we don't download too many images
283	while (download_images_.size() >= buffer_size_)
284	{
285	had_to_wait = true;
286	Thread.sleep(delay_);
287	}
288
289	// get the image from the url
290	Image image = Toolkit.getDefaultToolkit().getImage(url);
291
292	// push image onto the downloaded images
293	download_images_.push(image,from_url, img_name);
294
295	// if have completed the maximum number of downloads for the
296	// application then stop
297	if (download_images_.size() == app_.maxDownloads())
298	{
299	stop();
300	}
301
302	}
303	catch (Exception e) {
304	thread_running_ = false;
305	stop();
306	e.printStackTrace();
307	}
308	}
309
310	/** Connects to the starting url and looks for all images and links from this
311	* original page. Image links are processed first, so that any images found can be
312	* downloaded immediately and placed on the applet. Secondly, the links to other
313	* pages are recursively processed by this function and treated as a starting url
314	*
315	* @param new_url the url from which to start searching for images and links
316	* @param depth the number of links that have been followed on this path */
317	public void rec_add_images(String new_url, int depth)
318	{
319	System.err.println("Parsing url = " + new_url);
320
321	if (already_visited(new_url)) return;
322
323	// check if there is a scenario where external hyperlinks are being used
324	externalLinks();
325	String img_name = new String();
326
327	// connect to the url
328	CURL curl = new CURL(new_url);
329	if (curl.connected_ok())
330	{
331	// read the page
332	curl.readAll();
333
334	// get all the <code><img src=</code> links into a vector
335	Vector src_links = curl.getSrcLinks();
336
337	// process each of the image links according to the parameters given.
338	for (int i = 0; i < src_links.size(); i++)
339	{
340	URL url = (URL)src_links.elementAt(i);
341	String url_string = url.toString();
342
343	if (image_file_extension(url_string))
344	{
345	if (filter_image(url_string))
346	{
347	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
348
349	if (external_links_ != null && ! external_links_.isEmpty()) {
350	String ext = (String) external_links_.get(img_name);
351
352	if (ext != null)
353	add_image(url, ext, img_name);
354	else
355	add_image(url, new_url, img_name);
356	}
357	else {
358	add_image(url, new_url, img_name);
359	}
360	}
361	}
362
363	}
364
365	// get all the <code><a href=</code> links into a vector
366	Vector href_links = curl.getHrefLinks();
367
368	// process each of the href links according to the parameters given.
369	for (int i = 0; i < href_links.size(); i++)
370	{
371	URL url = (URL)href_links.elementAt(i);
372	String url_string = url.toString();
373
374	if (image_file_extension(url_string))
375	{
376	if (filter_image(url_string))
377	{
378	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
379
380	if (external_links_ != null && ! external_links_.isEmpty()) {
381	String ext = (String) external_links_.get(img_name);
382
383	if (ext != null)
384	add_image(url, ext, img_name);
385	else
386	add_image(url, new_url, img_name);
387	}
388	else {
389	add_image(url, url_string, img_name);
390	}
391	}
392	}
393	else
394	{
395	if (filter_href(url_string,new_url,depth))
396	{
397	rec_add_images(url_string,depth+1);
398
399	}
400	}
401	}
402	}
403
404	else {
405	System.err.println("Unable able to download "+new_url);
406	}
407	}
408
409
410	/** Used in cases where the image maps to a url outside of it's original location.
411	* When used with Greenstone the collage images will refer to documents in the collections
412	* from which the images are sourced. When used individually, the images may be saved into
413	* a user directory and the pages they reference may be external hyperlinks.
414	* This function reads that external links file and creates a hash map of the image to
415	* its external hyperlink. If the file does not exist the download thread will continue
416	* and assume the first case, that links are internal. */
417	public void externalLinks() {
418
419	try {
420
421	if (starting_url_.indexOf("gsdl") >= 0) {
422	external_links_ = null;
423	return;
424	}
425
426	// open a url to the file written
427	URL u = new URL(starting_url_ + "externallinks");
428
429	BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
430
431	external_links_ = new Hashtable();
432
433	String l = r.readLine();
434	// split the line of the space, first part is the image, second part the link
435	while (l != null) {
436
437	String tmp1 = new String();
438	String tmp2 = new String();
439
440	if (l.indexOf(" ") >= 0) {
441
442	tmp1 = l.substring(0, l.indexOf(" "));
443	if (l.length() > l.indexOf(" ") + 1)
444	tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
445	else
446	tmp2 = null;
447
448	if (tmp2 != null) {
449	external_links_.put(tmp1, tmp2);
450	//System.err.println(tmp1 + " " + tmp2);
451	}
452	}
453	l = r.readLine();
454	}
455
456	r.close();
457
458	} catch (Exception e) {
459	e.printStackTrace();
460	return;
461	}
462	}
463
464	/** Controls the download thread */
465	public void run ()
466	{
467	System.err.println("Starting download thread.");
468	visited_url_ = new Hashtable();
469	rec_add_images(starting_url_,1);
470
471	System.err.println("Download thread finished.");
472	}
473	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: