Context Navigation

source: trunk/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 11563

Last change on this file since 11563 was 11563, checked in by shaoqun, 18 years ago
changed the thread-handling code to make it thread-safe
Property svn:keywords set to `Author Date Id Revision`
File size: 17.3 KB

Line
1	package org.nzdl.gsdl.GsdlCollageApplet;
2
3	import java.awt.*;
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7
8	import javax.swing.ImageIcon; //****
9
10
11	/**
12	* @author Katrina Edgar
13	* @author David Bainbridge
14	*
15	* Controls retrieval of images from the specified starting url. Follows appropriate
16	* links from this starting point, traversing in a tree-like state through several other
17	* pages. Filters images and links based on specified parameters. Also controls the quantity
18	* of downloading that occurs by restricting the number of downloaded images that are yet to
19	* be displayed to 10, and the total number of downloads allowed is also restricted by
20	* the applet application (to prevent downloading occuring infinitely). */
21
22	public class DownloadUrls extends Thread {
23
24	/** Refers to applet */
25	GsdlCollageApplet app_ = null;
26	/** Refers to download thread */
27	DownloadImages download_images_ = null;
28
29	/** The address from which the application should start looking for images */
30	String starting_url_ = null;
31
32	/** the root directory of Greenstone*/
33	String document_root_ = null;
34
35
36	/** CHRIS - Holds the contents of the collection's assoc directory */
37	// File[] assocDir_ = null;
38
39	/** Restricts links followed from the starting url to links that contain this string */
40	String href_musthave_ = null;
41	/** Restricts links followed from the starting url to links that do not contain this string.
42	* Also prevents image names from containing this string */
43	String image_mustnothave_ = null;
44	/** Ignore images whose names begin with this string */
45	String image_ignore_ = null;
46	/** Restricts the types of images included in the collage, for example jpg, gif, etc. */
47	String image_type_ = null;
48
49	/** A static delay used when attempting to download more images into a full downloading buffer */
50	final int delay_ = 3000;
51	/** The maximum number of images to have downloaded and not yet displayed */
52	final int buffer_size_ = 1;
53
54	/** Used in cases where the image maps to a url outside of it's original location.
55	* When used with Greenstone the collage images will refer to documents in the collections
56	* from which the images are sourced. When used individually, the images may be saved into
57	* a user directory and the pages they reference may be external hyperlinks. */
58	Hashtable external_links_ = null;
59
60	/** Records all urls which have already been examined */
61	Hashtable visited_url_ = null;
62	/** Determines whether there are still pages to examine and images to download */
63	boolean thread_running_ = true;
64
65	int verbosity_ = 0;
66
67	protected boolean busy_ = false;
68
69	MediaTracker tracker;
70
71	/** Constructor to initialise a download thread from which images are found,
72	* saves parameters into local variables for use within the class.
73	*
74	* @param app reference to the applet
75	* @param download_images class which stores the images retrieved in triplets
76	* @param starting_url the url from which the search for images should begin
77	* @param href_musthave restricts links to only those containing this string
78	* @param image_mustnothave restricts links and image names to only those that don't contain this string
79	* @param image_ignore restricts the beginning of image names
80	* @param image_type restricts the type of images included in the collage to those named */
81	public DownloadUrls(GsdlCollageApplet app,
82	DownloadImages download_images, String starting_url,
83	String href_musthave, String image_mustnothave,
84	String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
85	{
86	super("DownloadUrls");
87	app_ = app;
88	download_images_ = download_images;
89
90	starting_url_ = starting_url;
91	href_musthave_ = href_musthave;
92	image_mustnothave_ = image_mustnothave;
93	image_ignore_ = image_ignore;
94	image_type_ = image_type;
95	document_root_ = document_root;
96	verbosity_ = verbosity;
97	tracker = trk;
98
99	System.err.println("starting_url_ " + starting_url +"\n"+
100	"href_musthave_ " + href_musthave +"\n"+
101	"image_mustnothave_" + image_mustnothave+"\n"+
102	"image_ignore_ "+ image_ignore+"\n"+
103	"image_type_ "+ image_type+"\n"+
104	"document root "+ document_root_
105	);
106
107
108
109
110	}
111
112	public boolean getStatus(){
113	return busy_;
114
115	}
116
117	/** Determines whether or not a url has already been examined
118	*
119	* @param url_string the url to check
120	* @return true if the url has been visited, false if not */
121	public boolean already_visited(String url_string)
122	{
123	int hash_pos = url_string.indexOf("#");
124	if (hash_pos>0)
125	{
126	// strip off #anchor reference
127	url_string = url_string.substring(0,hash_pos);
128	}
129
130	// if the url has been visited before, return true
131	if (visited_url_.containsKey(url_string))
132	{
133	if (verbosity_ > 3)
134	{
135	System.err.println("Visited " + url_string + " before!");
136	}
137	return true;
138	}
139
140	visited_url_.put(url_string,"visited");
141
142	return false;
143	}
144
145	/** Restricts the type of images that can be included in the collage
146	*
147	* @param url_string the url to check
148	* @return true if the image is of a specified type, false if not */
149	public boolean image_file_extension(String url_string)
150	{
151	// lower case comparisons
152	String url_lstring = url_string.toLowerCase();
153
154	if (image_type_ == null)
155	return true;
156
157	String tmp = image_type_;
158	String original_image_type_ = image_type_;
159
160	while (image_type_ != null && image_type_.indexOf("%") >= 0) {
161
162	tmp = image_type_.substring(0, image_type_.indexOf("%"));
163
164	if (image_type_.length() > image_type_.indexOf("%") + 1)
165	image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
166	else
167	image_type_ = null;
168
169	if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
170	image_type_ = original_image_type_;
171	return true;
172	}
173	}
174
175	if (image_type_ != null && url_lstring.endsWith(image_type_)) {
176	image_type_ = original_image_type_;
177	return true;
178	}
179
180	image_type_ = original_image_type_;
181	return false;
182	}
183
184	/** Restricts images to only those that satisfy several specified conditions
185	* regarding the content of the image name and url.
186	*
187	* @param url_string the url to check
188	* @return true if the image is satisfactory, false if not */
189	public boolean filter_image(String url_string)
190	{
191
192	if (image_ignore_==null \|\| !url_string.startsWith(image_ignore_))
193	{
194	if (!already_visited(url_string))
195	{
196	if (image_mustnothave_ != null) {
197
198	String tmp = image_mustnothave_;
199	String original_image_mustnothave_ = image_mustnothave_;
200
201	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
202
203	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
204	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
205	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
206	image_mustnothave_.length());
207	else
208	image_mustnothave_ = null;
209
210
211
212	if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
213
214	image_mustnothave_ = original_image_mustnothave_;
215	return false;
216	}
217	}
218
219	image_mustnothave_ = original_image_mustnothave_;
220
221	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
222	image_mustnothave_ = original_image_mustnothave_;
223	return false;
224	}
225
226	if (verbosity_ > 2) {
227	System.err.println("src url = "+ url_string);
228	}
229
230	image_mustnothave_ = original_image_mustnothave_;
231
232	}
233
234	}
235
236	}
237
238	return true;
239	}
240
241	/** Restricts links to only those that satisfy several specified conditions
242	* regarding the address of the link.
243	*
244	* @param url_string the url to check
245	* @param new_url_string the url from which this link was found
246	* @param depth the number of links followed on this path
247	* @return true if the image is satisfactory, false if not */
248	public boolean filter_href(String url_string, String new_url_string, int depth)
249	{
250	boolean has_href = false;
251	String tmp = href_musthave_;
252	String original_href_musthave_ = href_musthave_;
253
254	// checks that it does contain this content
255	if (href_musthave_ != null) {
256
257	while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
258
259	tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
260	if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
261	href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
262	else
263	href_musthave_ = null;
264
265	if (url_string.indexOf(tmp) >= 0)
266	has_href = true;
267	}
268
269	if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
270	has_href = true;
271
272	href_musthave_ = original_href_musthave_;
273	}
274
275	tmp = image_mustnothave_;
276	String original_image_mustnothave_ = image_mustnothave_;
277
278	// checks that it doesn't contain this content
279	if (image_mustnothave_ != null) {
280
281	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
282
283	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
284	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
285	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
286	else
287	image_mustnothave_ = null;
288
289	if (url_string.indexOf(tmp) >= 0)
290	has_href = false;
291	}
292	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
293	has_href = false;
294
295	image_mustnothave_ = original_image_mustnothave_;
296	}
297
298	// return true if the link is valid and false if not
299	if (href_musthave_==null \|\| has_href)
300	{
301	// might be another URL
302	if (depth < app_.maxDepth())
303	{
304	if (!new_url_string.startsWith(url_string))
305	{
306	return true;
307	}
308	}
309	}
310	return false;
311	}
312
313	/** Adds an image to the stored downloaded images as a triplet.
314	* Ensures that the number of images downloaded but not displayed at
315	* any one time is controlled by using a buffer. If the buffer is
316	* full this function will wait until space becomes available before
317	* continuing. It also restricts the
318	* total number of images to download as specified by the applet.
319	*
320	* @param url the image to download
321	* @param from_url the url that this image was sourced from
322	* @param img_name the name of the image */
323	public void add_image(URL url, String from_url, String img_name)
324	{
325	// get the image from the url
326	if (verbosity_>=2) {
327	//System.err.println(" ****Downloading image URL: " + url.toString());
328	}
329
330	int size = download_images_.downloadImage(tracker,url, from_url, img_name);
331
332	try{
333	// if have completed the maximum number of downloads for the
334	// application then stop
335	if (size == app_.maxDownloads()) {
336	stop();
337	}
338
339	}
340	catch (Exception e) {
341	thread_running_ = false;
342	stop();
343	e.printStackTrace();
344	}
345	}
346
347	/** Connects to the starting url and looks for all images and links from this
348	* original page. Image links are processed first, so that any images found can be
349	* downloaded immediately and placed on the applet. Secondly, the links to other
350	* pages are recursively processed by this function and treated as a starting url
351	*
352	* @param new_url the url from which to start searching for images and links
353	* @param depth the number of links that have been followed on this path */
354	public void rec_add_images(String new_url, int depth)
355	{
356
357	if (already_visited(new_url)) return;
358
359	// check if there is a scenario where external hyperlinks are being used
360	externalLinks();
361	String img_name = new String();
362
363	// connect to the url
364	CURL curl = new CURL(new_url);
365	if (curl.connected_ok())
366	{
367	if (verbosity_ >= 1) {
368	System.err.print("Connected OK ... ");
369	}
370
371	// read the page
372	curl.readAll();
373	if (verbosity_ >= 1) {
374	System.err.println("URL read.");
375	}
376
377	// get all the <code><img src=</code> links into a vector
378	Vector src_links = curl.getSrcLinks();
379
380	if (verbosity_ >= 2) {
381	System.err.println(" Got src links... there are " + src_links.size() + " of them.");
382	}
383	// process each of the image links according to the parameters given.
384	for (int i = 0; i < src_links.size(); i++)
385	{
386	URL url = (URL)src_links.get(i);
387	String url_string = url.toString();
388
389	//System.err.println(" source links " + i + " [" + url_string +"]");
390
391	if (verbosity_ >= 4) {
392	System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
393	}
394
395	if (image_file_extension(url_string))
396	{
397	if (filter_image(url_string))
398	{
399	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
400
401	if (verbosity_ >= 2) {
402	System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
403	}
404
405	if ((external_links_ != null) && (!external_links_.isEmpty())) {
406	String ext = (String) external_links_.get(img_name);
407
408
409	if (ext != null){
410	add_image(url, ext, img_name);
411
412	}
413	else{
414
415	add_image(url, new_url, img_name);
416	}
417	}
418	else {
419
420	add_image(url, new_url, img_name);
421	}
422
423
424	}
425
426	}
427
428	}
429
430	// get all the <code><a href=</code> links into a vector
431	Vector href_links = curl.getHrefLinks();
432
433
434	if (verbosity_ >= 2) {
435	System.err.println(" Got href links... there are " + href_links.size() + " of them.");
436	}
437
438
439	// process each of the href links according to the parameters given.
440	for (int i = 0; i < href_links.size(); i++)
441	{
442	URL url = (URL)href_links.get(i);
443	String url_string = url.toString();
444
445	//System.err.println(" href links " + i + "[" + url_string +"]");
446
447
448	if (image_file_extension(url_string))
449	{
450
451	if (filter_image(url_string))
452
453	{
454	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
455
456	if ((external_links_ != null) && (!external_links_.isEmpty())) {
457	String ext = (String) external_links_.get(img_name);
458
459	if (ext != null)
460	add_image(url, ext, img_name);
461	else
462	add_image(url, new_url, img_name);
463	}
464	else {
465	add_image(url, url_string, img_name);
466	}
467	}
468	}
469	else
470	{
471	if (filter_href(url_string,new_url,depth))
472	{
473	rec_add_images(url_string,depth+1);
474
475	}
476	}
477	}
478	}
479
480	else {
481	System.err.println("Unable able to download "+new_url);
482	}
483	}
484
485
486	/** Used in cases where the image maps to a url outside of it's original location.
487	* When used with Greenstone the collage images will refer to documents in the collections
488	* from which the images are sourced. When used individually, the images may be saved into
489	* a user directory and the pages they reference may be external hyperlinks.
490	* This function reads that external links file and creates a hash map of the image to
491	* its external hyperlink. If the file does not exist the download thread will continue
492	* and assume the first case, that links are internal. */
493	public void externalLinks() {
494	external_links_ = null;
495
496	try {
497
498	if (starting_url_.indexOf(document_root_) >= 0 ){
499	return;
500	}
501
502	// open a url to the file written
503	URL u = new URL(starting_url_ + "externallinks");
504
505	BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
506
507	external_links_ = new Hashtable();
508
509	String l = r.readLine();
510	// split the line of the space, first part is the image, second part the link
511	while (l != null) {
512
513	String tmp1 = new String();
514	String tmp2 = new String();
515
516	if (l.indexOf(" ") >= 0) {
517
518	tmp1 = l.substring(0, l.indexOf(" "));
519	if (l.length() > l.indexOf(" ") + 1)
520	tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
521	else
522	tmp2 = null;
523
524	if (tmp2 != null) {
525	external_links_.put(tmp1, tmp2);
526	//System.err.println(tmp1 + " " + tmp2);
527	}
528	}
529	l = r.readLine();
530	}
531
532	r.close();
533
534	} catch (Exception e) {
535	e.printStackTrace();
536	return;
537	}
538	}
539
540	/** Controls the download thread */
541	public void run ()
542	{
543	System.err.println("Starting download thread.");
544	visited_url_ = new Hashtable();
545
546	rec_add_images(starting_url_,1);
547	download_images_.stopDownload();
548	System.err.println("Download thread finished.");
549	}
550	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: