Context Navigation

source: tags/gsdl-2_70u-distribution/gsdl/src/java/org/nzdl/gsdl/GsdlCollageApplet/DownloadUrls.java@ 11745

Last change on this file since 11745 was 11715, checked in by kjdon, 18 years ago
committed Shaoquns version of the applet for the branch - version 1.7
Property svn:keywords set to `Author Date Id Revision`
File size: 18.7 KB

Line
1	package org.nzdl.gsdl.GsdlCollageApplet;
2
3	import java.awt.*;
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7
8	import javax.swing.ImageIcon; //****
9
10
11	/**
12	* @author Katrina Edgar
13	* @author David Bainbridge
14	*
15	* Controls retrieval of images from the specified starting url. Follows appropriate
16	* links from this starting point, traversing in a tree-like state through several other
17	* pages. Filters images and links based on specified parameters. Also controls the quantity
18	* of downloading that occurs by restricting the number of downloaded images that are yet to
19	* be displayed to 10, and the total number of downloads allowed is also restricted by
20	* the applet application (to prevent downloading occuring infinitely). */
21
22	public class DownloadUrls extends Thread {
23
24	/** Refers to applet */
25	GsdlCollageApplet app_ = null;
26	/** Refers to download thread */
27	DownloadImages download_images_ = null;
28
29	/** The address from which the application should start looking for images */
30	String starting_url_ = null;
31
32	/** the root directory of Greenstone*/
33	String document_root_ = null;
34
35
36	/** CHRIS - Holds the contents of the collection's assoc directory */
37	// File[] assocDir_ = null;
38
39	/** Restricts links followed from the starting url to links that contain this string */
40	String href_musthave_ = null;
41	/** Restricts links followed from the starting url to links that do not contain this string.
42	* Also prevents image names from containing this string */
43	String image_mustnothave_ = null;
44	/** Ignore images whose names begin with this string */
45	String image_ignore_ = null;
46	/** Restricts the types of images included in the collage, for example jpg, gif, etc. */
47	String image_type_ = null;
48
49	/** A static delay used when attempting to download more images into a full downloading buffer */
50	final int delay_ = 3000;
51	/** The maximum number of images to have downloaded and not yet displayed */
52	final int buffer_size_ = 1;
53
54	/** Used in cases where the image maps to a url outside of it's original location.
55	* When used with Greenstone the collage images will refer to documents in the collections
56	* from which the images are sourced. When used individually, the images may be saved into
57	* a user directory and the pages they reference may be external hyperlinks. */
58	Hashtable external_links_ = null;
59
60	/** Records all urls which have already been examined */
61	Hashtable visited_url_ = null;
62	/** Determines whether there are still pages to examine and images to download */
63	boolean thread_running_ = true;
64
65	int verbosity_ = 0;
66
67	/** Records all images which have already been examined */
68	Hashtable visited_images_ = null;
69
70	MediaTracker tracker;
71
72	/** Constructor to initialise a download thread from which images are found,
73	* saves parameters into local variables for use within the class.
74	*
75	* @param app reference to the applet
76	* @param download_images class which stores the images retrieved in triplets
77	* @param starting_url the url from which the search for images should begin
78	* @param href_musthave restricts links to only those containing this string
79	* @param image_mustnothave restricts links and image names to only those that don't contain this string
80	* @param image_ignore restricts the beginning of image names
81	* @param image_type restricts the type of images included in the collage to those named */
82	public DownloadUrls(GsdlCollageApplet app,
83	DownloadImages download_images, String starting_url,
84	String href_musthave, String image_mustnothave,
85	String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
86	{
87	super("DownloadUrls");
88	app_ = app;
89	download_images_ = download_images;
90
91	starting_url_ = starting_url;
92	href_musthave_ = href_musthave;
93	image_mustnothave_ = image_mustnothave;
94	image_ignore_ = image_ignore;
95	image_type_ = image_type;
96	document_root_ = document_root;
97	verbosity_ = verbosity;
98	tracker = trk;
99
100	System.err.println("starting_url_ " + starting_url +"\n"+
101	"href_musthave_ " + href_musthave +"\n"+
102	"image_mustnothave_ " + image_mustnothave+"\n"+
103	"image_ignore_ "+ image_ignore+"\n"+
104	"image_type_ "+ image_type+"\n"+
105	"document root "+ document_root_
106	);
107
108
109
110
111	}
112
113
114	/** Determines whether or not a url has already been examined
115	*
116	* @param url_string the url to check
117	* @return true if the url has been visited, false if not */
118	public boolean already_visited(String url_string)
119	{
120	int hash_pos = url_string.indexOf("#");
121	if (hash_pos>0)
122	{
123	// strip off #anchor reference
124	url_string = url_string.substring(0,hash_pos);
125	}
126
127	// if the url has been visited before, return true
128	if (visited_url_.containsKey(url_string))
129	{
130	if (verbosity_ > 3)
131	{
132	System.err.println("Visited " + url_string + " before!");
133	}
134	return true;
135	}
136
137	visited_url_.put(url_string,"visited");
138
139	return false;
140	}
141
142	/** Determines whether or not an images or its screenview has been visited)
143	* has already been examined
144	*
145	* @param url_string the url to check
146	* @param img_name the image to check
147	* @return true if the url has been visited, false if not */
148	public boolean image_visited(String url_string, String img_name)
149	{
150	String hash_dir = url_string.substring(0,url_string.lastIndexOf("/"));
151
152	if ( visited_images_.containsKey(hash_dir)){
153	Hashtable hashed_images = (Hashtable)visited_images_.get(hash_dir);
154
155	if (img_name.startsWith("screenview")){
156	return true;
157	}
158
159	if (hashed_images.containsKey(img_name)){
160	return true;
161	}
162
163	Enumeration enu = hashed_images.keys();
164	for(;enu.hasMoreElements();){
165	String name = (String)enu.nextElement();
166	if(name.startsWith("screenview")){
167	return true;
168	}
169	}
170
171	hashed_images.put(img_name,"visited");
172	}
173	else{
174	Hashtable hashed_images = new Hashtable();
175	hashed_images.put(img_name,"visited");
176	visited_images_.put(hash_dir,hashed_images);
177	}
178
179	return false;
180	}
181
182
183
184
185
186	/** Restricts the type of images that can be included in the collage
187	*
188	* @param url_string the url to check
189	* @return true if the image is of a specified type, false if not */
190	public boolean image_file_extension(String url_string)
191	{
192	// lower case comparisons
193	String url_lstring = url_string.toLowerCase();
194
195	if (image_type_ == null)
196	return true;
197
198	String tmp = image_type_;
199	String original_image_type_ = image_type_;
200
201	while (image_type_ != null && image_type_.indexOf("%") >= 0) {
202
203	tmp = image_type_.substring(0, image_type_.indexOf("%"));
204
205	if (image_type_.length() > image_type_.indexOf("%") + 1)
206	image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
207	else
208	image_type_ = null;
209
210	if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
211	image_type_ = original_image_type_;
212	return true;
213	}
214	}
215
216	if (image_type_ != null && url_lstring.endsWith(image_type_)) {
217	image_type_ = original_image_type_;
218	return true;
219	}
220
221	image_type_ = original_image_type_;
222	return false;
223	}
224
225	/** Restricts images to only those that satisfy several specified conditions
226	* regarding the content of the image name and url.
227	*
228	* @param url_string the url to check
229	* @return true if the image is satisfactory, false if not */
230	public boolean filter_image(String url_string)
231	{
232
233	if (image_ignore_==null \|\| !url_string.startsWith(image_ignore_))
234	{
235	if (!already_visited(url_string))
236	{
237	if (image_mustnothave_ != null) {
238
239	String tmp = image_mustnothave_;
240	String original_image_mustnothave_ = image_mustnothave_;
241
242	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
243
244	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
245	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
246	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
247	image_mustnothave_.length());
248	else
249	image_mustnothave_ = null;
250
251
252
253	if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
254
255	image_mustnothave_ = original_image_mustnothave_;
256	return false;
257	}
258	}
259
260	image_mustnothave_ = original_image_mustnothave_;
261
262	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
263	image_mustnothave_ = original_image_mustnothave_;
264	return false;
265	}
266
267	if (verbosity_ > 2) {
268	System.err.println("src url = "+ url_string);
269	}
270
271	image_mustnothave_ = original_image_mustnothave_;
272
273	}
274
275	}
276
277	}
278
279	return true;
280	}
281
282	/** Restricts links to only those that satisfy several specified conditions
283	* regarding the address of the link.
284	*
285	* @param url_string the url to check
286	* @param new_url_string the url from which this link was found
287	* @param depth the number of links followed on this path
288	* @return true if the image is satisfactory, false if not */
289	public boolean filter_href(String url_string, String new_url_string, int depth)
290	{
291	boolean has_href = false;
292	String tmp = href_musthave_;
293	String original_href_musthave_ = href_musthave_;
294
295	// checks that it does contain this content
296	if (href_musthave_ != null) {
297
298	while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
299
300	tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
301	if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
302	href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
303	else
304	href_musthave_ = null;
305
306	if (url_string.indexOf(tmp) >= 0)
307	has_href = true;
308	}
309
310	if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
311	has_href = true;
312
313	href_musthave_ = original_href_musthave_;
314	}
315
316	tmp = image_mustnothave_;
317	String original_image_mustnothave_ = image_mustnothave_;
318
319	// checks that it doesn't contain this content
320	if (image_mustnothave_ != null) {
321
322	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
323
324	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
325	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
326	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
327	else
328	image_mustnothave_ = null;
329
330	if (url_string.indexOf(tmp) >= 0)
331	has_href = false;
332	}
333	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
334	has_href = false;
335
336	image_mustnothave_ = original_image_mustnothave_;
337	}
338
339	// return true if the link is valid and false if not
340	if (href_musthave_==null \|\| has_href)
341	{
342	// might be another URL
343	if (depth < app_.maxDepth())
344	{
345	if (!new_url_string.startsWith(url_string))
346	{
347	return true;
348	}
349	}
350	}
351	return false;
352	}
353
354	/** Adds an image to the stored downloaded images as a triplet.
355	* Ensures that the number of images downloaded but not displayed at
356	* any one time is controlled by using a buffer. If the buffer is
357	* full this function will wait until space becomes available before
358	* continuing. It also restricts the
359	* total number of images to download as specified by the applet.
360	*
361	* @param url the image to download
362	* @param from_url the url that this image was sourced from
363	* @param img_name the name of the image */
364	public void add_image(URL url, String from_url, String img_name)
365	{
366	// get the image from the url
367	if (verbosity_>=2) {
368	System.err.println(" Downloading image URL: " + url.toString());
369	}
370
371	if (image_visited(url.toString(),img_name)) return;
372
373	int size = download_images_.downloadImage(tracker,url, from_url, img_name);
374
375	try{
376	// if have completed the maximum number of downloads for the
377	// application then stop
378	if (size == app_.maxDownloads()) {
379	stop();
380	}
381
382	}
383	catch (Exception e) {
384	thread_running_ = false;
385	stop();
386	e.printStackTrace();
387	}
388	}
389
390	/** Connects to the starting url and looks for all images and links from this
391	* original page. Image links are processed first, so that any images found can be
392	* downloaded immediately and placed on the applet. Secondly, the links to other
393	* pages are recursively processed by this function and treated as a starting url
394	*
395	* @param new_url the url from which to start searching for images and links
396	* @param depth the number of links that have been followed on this path */
397	public void rec_add_images(String new_url, int depth)
398	{
399
400	if (already_visited(new_url)) return;
401
402	// check if there is a scenario where external hyperlinks are being used
403	externalLinks();
404	String img_name = new String();
405
406	// connect to the url
407	CURL curl = new CURL(new_url);
408	if (curl.connected_ok())
409	{
410	if (verbosity_ >= 1) {
411	System.err.print("Connected OK ... ");
412	}
413
414	// read the page
415	curl.readAll();
416	if (verbosity_ >= 1) {
417	System.err.println("URL read.");
418	}
419
420	// get all the <code><img src=</code> links into a vector
421	Vector src_links = curl.getSrcLinks();
422
423
424	if (verbosity_ >= 2) {
425	System.err.println(" Got src links... there are " + src_links.size() + " of them.");
426	}
427	// process each of the image links according to the parameters given.
428	for (int i = 0; i < src_links.size(); i++)
429	{
430	URL url = (URL)src_links.get(i);
431	String url_string = url.toString();
432
433	//System.err.println(" source links " + i + " [" + url_string +"]");
434
435	if (verbosity_ >= 4) {
436	System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
437	}
438
439	if (image_file_extension(url_string))
440	{
441	if (filter_image(url_string))
442	{
443	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
444
445	if (verbosity_ >= 2) {
446	System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
447	}
448
449	if ((external_links_ != null) && (!external_links_.isEmpty())) {
450	String ext = (String) external_links_.get(img_name);
451
452
453	if (ext != null){
454	add_image(url, ext, img_name);
455
456	}
457	else{
458
459	add_image(url, new_url, img_name);
460	}
461	}
462	else {
463
464	add_image(url, new_url, img_name);
465	}
466
467
468	}
469
470	}
471
472	}
473
474	// get all the <code><a href=</code> links into a vector
475	Vector href_links = curl.getHrefLinks();
476
477
478	if (verbosity_ >= 2) {
479	System.err.println(" Got href links... there are " + href_links.size() + " of them.");
480	}
481
482
483	// process each of the href links according to the parameters given.
484	for (int i = 0; i < href_links.size(); i++)
485	{
486
487	URL url = (URL)href_links.get(i);
488	String url_string = url.toString();
489	//System.err.println(" href links " + i + "[" + url_string +"]");
490
491	if (image_file_extension(url_string))
492	{
493
494	if (filter_image(url_string))
495
496	{
497
498	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
499
500	if ((external_links_ != null) && (!external_links_.isEmpty())) {
501	String ext = (String) external_links_.get(img_name);
502
503	if (ext != null)
504	add_image(url, ext, img_name);
505	else
506	add_image(url, new_url, img_name);
507	}
508	else {
509	add_image(url, url_string, img_name);
510	}
511	}
512	}
513	else
514	{
515	if (filter_href(url_string,new_url,depth))
516	{
517
518	rec_add_images(url_string,depth+1);
519
520	}
521	}
522	}
523	}
524
525	else {
526	System.err.println("Unable able to download "+new_url);
527	}
528	}
529
530
531	/** Used in cases where the image maps to a url outside of it's original location.
532	* When used with Greenstone the collage images will refer to documents in the collections
533	* from which the images are sourced. When used individually, the images may be saved into
534	* a user directory and the pages they reference may be external hyperlinks.
535	* This function reads that external links file and creates a hash map of the image to
536	* its external hyperlink. If the file does not exist the download thread will continue
537	* and assume the first case, that links are internal. */
538	public void externalLinks() {
539	external_links_ = null;
540	try {
541
542	if (starting_url_ == null \|\| (document_root_ != null && starting_url_.indexOf(document_root_) >= 0) ){
543	return;
544	}
545
546	// open a url to the file written
547	URL u = new URL(starting_url_ + "externallinks");
548
549	BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
550
551	external_links_ = new Hashtable();
552
553	String l = r.readLine();
554	// split the line of the space, first part is the image, second part the link
555	while (l != null) {
556
557	String tmp1 = new String();
558	String tmp2 = new String();
559
560	if (l.indexOf(" ") >= 0) {
561
562	tmp1 = l.substring(0, l.indexOf(" "));
563	if (l.length() > l.indexOf(" ") + 1)
564	tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
565	else
566	tmp2 = null;
567
568	if (tmp2 != null) {
569
570	external_links_.put(tmp1, tmp2);
571	//System.err.println(tmp1 + " " + tmp2);
572	}
573	}
574	l = r.readLine();
575	}
576
577	r.close();
578
579	} catch (Exception e) {
580	e.printStackTrace();
581	return;
582	}
583	}
584
585	/** Controls the download thread */
586	public void run ()
587	{
588	System.err.println("Starting download thread.");
589	visited_url_ = new Hashtable();
590	visited_images_ = new Hashtable();
591
592	rec_add_images(starting_url_,1);
593	download_images_.stopDownload();
594	System.err.println("Download thread finished.");
595	}
596	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: