Context Navigation

source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 11472

Last change on this file since 11472 was 11472, checked in by shaoqun, 18 years ago
set external_links_ = null if exception occurs
Property svn:keywords set to `Author Date Id Revision`
File size: 17.7 KB

Line
1	package org.nzdl.gsdl.GsdlCollageApplet;
2
3	import java.awt.*;
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7
8	// import javax.swing.ImageIcon; //****
9
10
11	/**
12	* @author Katrina Edgar
13	* @author David Bainbridge
14	*
15	* Controls retrieval of images from the specified starting url. Follows appropriate
16	* links from this starting point, traversing in a tree-like state through several other
17	* pages. Filters images and links based on specified parameters. Also controls the quantity
18	* of downloading that occurs by restricting the number of downloaded images that are yet to
19	* be displayed to 10, and the total number of downloads allowed is also restricted by
20	* the applet application (to prevent downloading occurring infinitely). */
21
22	public class DownloadUrls extends Thread {
23
24	/** Refers to applet */
25	GsdlCollageApplet app_ = null;
26	/** Refers to download thread */
27	DownloadImages download_images_ = null;
28
29	/** The address from which the application should start looking for images */
30	String starting_url_ = null;
31
32	/** the root directory of Greenstone*/
33	String document_root_ = null;
34
35
36	/** CHRIS - Holds the contents of the collection's assoc directory */
37	// File[] assocDir_ = null;
38
39	/** Restricts links followed from the starting url to links that contain this string */
40	String href_musthave_ = null;
41	/** Restricts links followed from the starting url to links that do not contain this string.
42	* Also prevents image names from containing this string */
43	String image_mustnothave_ = null;
44	/** Ignore images whose names begin with this string */
45	String image_ignore_ = null;
46	/** Restricts the types of images included in the collage, for example jpg, gif, etc. */
47	String image_type_ = null;
48
49	/** A static delay used when attempting to download more images into a full downloading buffer */
50	final int delay_ = 1000;
51	/** The maximum number of images to have downloaded and not yet displayed */
52	final int buffer_size_ = 10;
53
54	/** Used in cases where the image maps to a url outside of it's original location.
55	* When used with Greenstone the collage images will refer to documents in the collections
56	* from which the images are sourced. When used individually, the images may be saved into
57	* a user directory and the pages they reference may be external hyperlinks. */
58	Hashtable external_links_ = null;
59
60	/** Records all urls which have already been examined */
61	Hashtable visited_url_ = null;
62	/** Determines whether there are still pages to examine and images to download */
63	boolean thread_running_ = true;
64
65	int verbosity_ = 0;
66
67	/** Constructor to initialise a download thread from which images are found,
68	* saves parameters into local variables for use within the class.
69	*
70	* @param app reference to the applet
71	* @param download_images class which stores the images retrieved in triplets
72	* @param starting_url the url from which the search for images should begin
73	* @param href_musthave restricts links to only those containing this string
74	* @param image_mustnothave restricts links and image names to only those that don't contain this string
75	* @param image_ignore restricts the beginning of image names
76	* @param image_type restricts the type of images included in the collage to those named */
77	public DownloadUrls(GsdlCollageApplet app,
78	DownloadImages download_images, String starting_url,
79	String href_musthave, String image_mustnothave,
80	String image_ignore, String image_type, String document_root,int verbosity)
81	{
82	super("DownloadUrls");
83	app_ = app;
84	download_images_ = download_images;
85
86	starting_url_ = starting_url;
87	href_musthave_ = href_musthave;
88	image_mustnothave_ = image_mustnothave;
89	image_ignore_ = image_ignore;
90	image_type_ = image_type;
91	document_root_ = document_root;
92	verbosity_ = verbosity;
93
94
95	System.err.println("starting_url_ " + starting_url +"\n"+
96	"href_musthave_ " + href_musthave +"\n"+
97	"image_mustnothave_" + image_mustnothave+"\n"+
98	"image_ignore_ "+ image_ignore+"\n"+
99	"image_type_ "+ image_type+"\n"+
100	"document root "+ document_root_
101	);
102
103
104
105
106	}
107
108
109
110	/** Determines whether or not a url has already been examined
111	*
112	* @param url_string the url to check
113	* @return true if the url has been visited, false if not */
114	public boolean already_visited(String url_string)
115	{
116	int hash_pos = url_string.indexOf("#");
117	if (hash_pos>0)
118	{
119	// strip off #anchor reference
120	url_string = url_string.substring(0,hash_pos);
121	}
122
123	// if the url has been visited before, return true
124	if (visited_url_.containsKey(url_string))
125	{
126	if (verbosity_ > 3)
127	{
128	System.err.println("Visited " + url_string + " before!");
129	}
130	return true;
131	}
132
133	visited_url_.put(url_string,"visited");
134
135	return false;
136	}
137
138	/** Restricts the type of images that can be included in the collage
139	*
140	* @param url_string the url to check
141	* @return true if the image is of a specified type, false if not */
142	public boolean image_file_extension(String url_string)
143	{
144	// lower case comparisons
145	String url_lstring = url_string.toLowerCase();
146
147	if (image_type_ == null)
148	return true;
149
150	String tmp = image_type_;
151	String original_image_type_ = image_type_;
152
153	while (image_type_ != null && image_type_.indexOf("%") >= 0) {
154
155	tmp = image_type_.substring(0, image_type_.indexOf("%"));
156
157	if (image_type_.length() > image_type_.indexOf("%") + 1)
158	image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
159	else
160	image_type_ = null;
161
162	if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
163	image_type_ = original_image_type_;
164	return true;
165	}
166	}
167
168	if (image_type_ != null && url_lstring.endsWith(image_type_)) {
169	image_type_ = original_image_type_;
170	return true;
171	}
172
173	image_type_ = original_image_type_;
174	return false;
175	}
176
177	/** Restricts images to only those that satisfy several specified conditions
178	* regarding the content of the image name and url.
179	*
180	* @param url_string the url to check
181	* @return true if the image is satisfactory, false if not */
182	public boolean filter_image(String url_string)
183	{
184
185	if (image_ignore_==null \|\| !url_string.startsWith(image_ignore_))
186	{
187	if (!already_visited(url_string))
188	{
189	if (image_mustnothave_ != null) {
190
191	String tmp = image_mustnothave_;
192	String original_image_mustnothave_ = image_mustnothave_;
193
194	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
195
196	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
197	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
198	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
199	image_mustnothave_.length());
200	else
201	image_mustnothave_ = null;
202
203
204
205	if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
206
207	image_mustnothave_ = original_image_mustnothave_;
208	return false;
209	}
210	}
211
212	image_mustnothave_ = original_image_mustnothave_;
213
214	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
215	image_mustnothave_ = original_image_mustnothave_;
216	return false;
217	}
218
219	if (verbosity_ > 2) {
220	System.err.println("src url = "+ url_string);
221	}
222
223	image_mustnothave_ = original_image_mustnothave_;
224
225	}
226
227	}
228
229	}
230
231	return true;
232	}
233
234	/** Restricts links to only those that satisfy several specified conditions
235	* regarding the address of the link.
236	*
237	* @param url_string the url to check
238	* @param new_url_string the url from which this link was found
239	* @param depth the number of links followed on this path
240	* @return true if the image is satisfactory, false if not */
241	public boolean filter_href(String url_string, String new_url_string, int depth)
242	{
243	boolean has_href = false;
244	String tmp = href_musthave_;
245	String original_href_musthave_ = href_musthave_;
246
247	// checks that it does contain this content
248	if (href_musthave_ != null) {
249
250	while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
251
252	tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
253	if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
254	href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
255	else
256	href_musthave_ = null;
257
258	if (url_string.indexOf(tmp) >= 0)
259	has_href = true;
260	}
261
262	if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
263	has_href = true;
264
265	href_musthave_ = original_href_musthave_;
266	}
267
268	tmp = image_mustnothave_;
269	String original_image_mustnothave_ = image_mustnothave_;
270
271	// checks that it doesn't contain this content
272	if (image_mustnothave_ != null) {
273
274	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
275
276	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
277	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
278	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
279	else
280	image_mustnothave_ = null;
281
282	if (url_string.indexOf(tmp) >= 0)
283	has_href = false;
284	}
285	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
286	has_href = false;
287
288	image_mustnothave_ = original_image_mustnothave_;
289	}
290
291	// return true if the link is valid and false if not
292	if (href_musthave_==null \|\| has_href)
293	{
294	// might be another URL
295	if (depth < app_.maxDepth())
296	{
297	if (!new_url_string.startsWith(url_string))
298	{
299	return true;
300	}
301	}
302	}
303	return false;
304	}
305
306	/** Adds an image to the stored downloaded images as a triplet.
307	* Ensures that the number of images downloaded but not displayed at
308	* any one time is controlled by using a buffer. If the buffer is
309	* full this function will wait until space becomes available before
310	* continuing. It also restricts the
311	* total number of images to download as specified by the applet.
312	*
313	* @param url the image to download
314	* @param from_url the url that this image was sourced from
315	* @param img_name the name of the image */
316	public void add_image(URL url, String from_url, String img_name)
317	{
318	try {
319
320	boolean had_to_wait = false;
321
322	// ensure that we don't download too many images
323	while (download_images_.size() >= buffer_size_)
324	{
325	had_to_wait = true;
326	Thread.sleep(delay_);
327	}
328
329	// get the image from the url
330	if (verbosity_>=2) {
331	System.err.println(" Downloading image URL: " + url.toString());
332	}
333
334	//ImageIcon image_icon = new ImageIcon(url);
335	// Image image = image_icon.getImage();
336
337	// Image image = Toolkit.getDefaultToolkit().createImage(url);
338	// Image image = app_.getImage(url);
339	Image image = Toolkit.getDefaultToolkit().getImage(url);
340
341	//System.err.println("###DownloadingED image URL: " + url.toString());
342
343	boolean status = app_.prepareImage(image,app_);
344	//System.err.println(" Prepare Image status = " + status);
345
346	Thread.sleep(100);
347
348	// push image onto the downloaded images
349	/* System.err.println("*** Pushing: name="+img_name
350	+" dimensions = "+image_icon.getIconWidth()+"x"+image_icon.getIconHeight());
351	// +" dimensions = "+image.getWidth(app_)+"x"+image.getHeight(app_));
352	*/
353
354	download_images_.push(image,from_url, img_name);
355
356	// if have completed the maximum number of downloads for the
357	// application then stop
358	if (download_images_.size() == app_.maxDownloads()) {
359	stop();
360	}
361
362	}
363	catch (Exception e) {
364	thread_running_ = false;
365	stop();
366	e.printStackTrace();
367	}
368	}
369
370	/** Connects to the starting url and looks for all images and links from this
371	* original page. Image links are processed first, so that any images found can be
372	* downloaded immediately and placed on the applet. Secondly, the links to other
373	* pages are recursively processed by this function and treated as a starting url
374	*
375	* @param new_url the url from which to start searching for images and links
376	* @param depth the number of links that have been followed on this path */
377	public void rec_add_images(String new_url, int depth)
378	{
379
380
381	System.err.println("Parsing url = " + new_url);
382
383	if (already_visited(new_url)) return;
384
385	// check if there is a scenario where external hyperlinks are being used
386	externalLinks();
387	String img_name = new String();
388
389	// connect to the url
390	CURL curl = new CURL(new_url);
391	if (curl.connected_ok())
392	{
393	if (verbosity_ >= 1) {
394	System.err.print("Connected OK ... ");
395	}
396
397	// read the page
398	curl.readAll();
399	if (verbosity_ >= 1) {
400	System.err.println("URL read.");
401	}
402
403	// get all the <code><img src=</code> links into a vector
404	Vector src_links = curl.getSrcLinks();
405
406	if (verbosity_ >= 2) {
407	System.err.println(" Got src links... there are " + src_links.size() + " of them.");
408	}
409	// process each of the image links according to the parameters given.
410	for (int i = 0; i < src_links.size(); i++)
411	{
412	URL url = (URL)src_links.elementAt(i);
413	String url_string = url.toString();
414
415	//System.err.println(" source links " + i + " [" + url_string +"]");
416
417	if (verbosity_ >= 3) {
418	System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
419	}
420
421	if (image_file_extension(url_string))
422	{
423	if (filter_image(url_string))
424	{
425	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
426
427	if (verbosity_ >= 2) {
428	System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
429	}
430
431	if ((external_links_ != null) && (!external_links_.isEmpty())) {
432	String ext = (String) external_links_.get(img_name);
433
434
435	if (ext != null){
436	add_image(url, ext, img_name);
437
438	}
439	else{
440
441	add_image(url, new_url, img_name);
442	}
443	}
444	else {
445
446	add_image(url, new_url, img_name);
447	}
448
449
450	}
451
452	}
453
454	}
455
456	// get all the <code><a href=</code> links into a vector
457	Vector href_links = curl.getHrefLinks();
458
459
460	if (verbosity_ >= 2) {
461	System.err.println(" Got href links... there are " + href_links.size() + " of them.");
462	}
463
464
465	// process each of the href links according to the parameters given.
466	for (int i = 0; i < href_links.size(); i++)
467	{
468	URL url = (URL)href_links.elementAt(i);
469	String url_string = url.toString();
470
471	//System.err.println(" href links " + i + "[" + url_string +"]");
472
473
474	if (image_file_extension(url_string))
475	{
476
477	if (filter_image(url_string))
478
479	{
480	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
481
482	if ((external_links_ != null) && (!external_links_.isEmpty())) {
483	String ext = (String) external_links_.get(img_name);
484
485	if (ext != null)
486	add_image(url, ext, img_name);
487	else
488	add_image(url, new_url, img_name);
489	}
490	else {
491	add_image(url, url_string, img_name);
492	}
493	}
494	}
495	else
496	{
497	if (filter_href(url_string,new_url,depth))
498	{
499
500	System.out.println("*************************************");
501	rec_add_images(url_string,depth+1);
502
503	}
504	}
505	}
506	}
507
508	else {
509	System.err.println("Unable able to download "+new_url);
510	}
511	}
512
513
514	/** Used in cases where the image maps to a url outside of it's original location.
515	* When used with Greenstone the collage images will refer to documents in the collections
516	* from which the images are sourced. When used individually, the images may be saved into
517	* a user directory and the pages they reference may be external hyperlinks.
518	* This function reads that external links file and creates a hash map of the image to
519	* its external hyperlink. If the file does not exist the download thread will continue
520	* and assume the first case, that links are internal. */
521	public void externalLinks() {
522	external_links_ = null;
523
524	try {
525
526	if (starting_url_.indexOf(document_root_) >= 0 ){
527	return;
528	}
529
530	// open a url to the file written
531	URL u = new URL(starting_url_ + "externallinks");
532
533	BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
534
535	external_links_ = new Hashtable();
536
537	String l = r.readLine();
538	// split the line of the space, first part is the image, second part the link
539	while (l != null) {
540
541	String tmp1 = new String();
542	String tmp2 = new String();
543
544	if (l.indexOf(" ") >= 0) {
545
546	tmp1 = l.substring(0, l.indexOf(" "));
547	if (l.length() > l.indexOf(" ") + 1)
548	tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
549	else
550	tmp2 = null;
551
552	if (tmp2 != null) {
553	external_links_.put(tmp1, tmp2);
554	//System.err.println(tmp1 + " " + tmp2);
555	}
556	}
557	l = r.readLine();
558	}
559
560	r.close();
561
562	} catch (Exception e) {
563	e.printStackTrace();
564	return;
565	}
566	}
567
568	/** Controls the download thread */
569	public void run ()
570	{
571	System.err.println("Starting download thread.");
572	visited_url_ = new Hashtable();
573
574	rec_add_images(starting_url_,1);
575
576	System.err.println("Download thread finished.");
577	}
578	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: