Context Navigation

DownloadUrls.java@ 38968

Last change on this file since 38968 was 38968, checked in by anupama, 6 weeks ago

GsdlCollageApplet. 1. On stopRunning, DisplayImages may be looping in the graphics thread and in that case would throw an exception when interrupted. I now check isStopped() within the loop to exit more tidily in such a case. 2. Finally remembered to handle the case where there is no running GS3 server or no internet connection of any kind. It used to be the case that GsdlCollageApplet would display the Downloading message forever, noticeable when run as a commandline application and if I forgot to run the GS3 server. Now if downloading fails it will say so and come to a halt instead of waiting for downloads and displaying the Downloading message forever. 3. Some useful local changes to the debugging output on an exception in CURL.

File size: 22.8 KB

Line
1	package org.greenstone.applet.GsdlCollageApplet;
2
3	import java.awt.*;
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7
8	import javax.swing.ImageIcon; //****
9
10	//import org.apache.log4j.*;
11
12	/**
13	* @author Katrina Edgar
14	* @author David Bainbridge
15	*
16	* Controls retrieval of images from the specified starting url. Follows appropriate
17	* links from this starting point, traversing in a tree-like state through several other
18	* pages. Filters images and links based on specified parameters. Also controls the quantity
19	* of downloading that occurs by restricting the number of downloaded images that are yet to
20	* be displayed to 10, and the total number of downloads allowed is also restricted by
21	* the applet application (to prevent downloading occurring infinitely). */
22
23	public class DownloadUrls extends Thread {
24
25	// for GS3
26	String baseURL = null;
27
28	/** Refers to applet */
29	GsdlCollageApplet app_ = null;
30	/** Refers to download thread */
31	DownloadImages download_images_ = null;
32
33	/** The address from which the application should start looking for images */
34	String starting_url_ = null;
35
36	/** the root directory of Greenstone*/
37	String document_root_ = null;
38
39	/** When this thread is asked to stop running, this variable will be set to true */
40	private boolean stop_running = false;
41
42	/** When this thread is asked to stop downloading, this variable will be set to true.
43	* For now the behaviour is the same as stop_running=true on this thread,
44	* but in case it changes in the future, we have a separate variable.
45	* Also, calling stopRunning() is not the same as setting stop_running = true, so
46	* to be careful, a separate variable for stop_downloading could be safer when coding.
47	*/
48	private boolean stop_downloading = false;
49
50	/** Set to true when unable to download perhaps because of no internet connection. */
51	private boolean unable_to_download = false;
52
53	/** CHRIS - Holds the contents of the collection's assoc directory */
54	// File[] assocDir_ = null;
55
56	/** Restricts links followed from the starting url to links that contain this string */
57	String href_musthave_ = null;
58	/** Restricts links followed from the starting url to links that do not contain this string.
59	* Also prevents image names from containing this string */
60	String image_mustnothave_ = null;
61	/** Ignore images whose names begin with this string */
62	String image_ignore_ = null;
63	/** Restricts the types of images included in the collage, for example jpg, gif, etc. */
64	String image_type_ = null;
65
66	/** A static delay used when attempting to download more images into a full downloading buffer */
67	final int delay_ = 3000;
68	/** The maximum number of images to have downloaded and not yet displayed */
69	final int buffer_size_ = 1;
70
71	/** Used in cases where the image maps to a url outside of it's original location.
72	* When used with Greenstone the collage images will refer to documents in the collections
73	* from which the images are sourced. When used individually, the images may be saved into
74	* a user directory and the pages they reference may be external hyperlinks. */
75	Hashtable external_links_ = null;
76
77	/** Records all urls which have already been examined */
78	Hashtable visited_url_ = null;
79	/** Determines whether there are still pages to examine and images to download */
80	boolean thread_running_ = true;
81
82	int verbosity_ = 0;
83
84	/** Records all images which have already been examined */
85	Hashtable visited_images_ = null;
86
87	MediaTracker tracker;
88
89	/** Constructor to initialise a download thread from which images are found,
90	* saves parameters into local variables for use within the class.
91	*
92	* @param app reference to the applet
93	* @param download_images class which stores the images retrieved in triplets
94	* @param starting_url the url from which the search for images should begin
95	* @param href_musthave restricts links to only those containing this string
96	* @param image_mustnothave restricts links and image names to only those that don't contain this string
97	* @param image_ignore restricts the beginning of image names
98	* @param image_type restricts the type of images included in the collage to those named */
99	public DownloadUrls(GsdlCollageApplet app,
100	DownloadImages download_images, String starting_url,
101	String href_musthave, String image_mustnothave,
102	String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
103	{
104	super("DownloadUrls");
105	app_ = app;
106	download_images_ = download_images;
107
108	starting_url_ = starting_url;
109	href_musthave_ = href_musthave;
110	image_mustnothave_ = image_mustnothave;
111	image_ignore_ = image_ignore;
112	image_type_ = image_type;
113	document_root_ = document_root;
114	verbosity_ = verbosity;
115	tracker = trk;
116
117	System.err.println("starting_url_ " + starting_url +"\n"+
118	"href_musthave_ " + href_musthave +"\n"+
119	"image_mustnothave_ " + image_mustnothave+"\n"+
120	"image_ignore_ "+ image_ignore+"\n"+
121	"image_type_ "+ image_type+"\n"+
122	"document_root_ "+ document_root_
123	);
124
125
126
127
128	}
129
130	/** Determines whether or not a url has already been examined
131	*
132	* @param url_string the url to check
133	* @return true if the url has been visited, false if not */
134	public boolean already_visited(String url_string)
135	{
136	int hash_pos = url_string.indexOf("#");
137	if (hash_pos>0)
138	{
139	// strip off #anchor reference
140	url_string = url_string.substring(0,hash_pos);
141	}
142
143	// if the url has been visited before, return true
144	if (visited_url_.containsKey(url_string))
145	{
146	if (verbosity_ > 3)
147	{
148	System.err.println("Visited " + url_string + " before!");
149	}
150	return true;
151	}
152
153	visited_url_.put(url_string,"visited");
154
155	return false;
156	}
157
158	/** Determines whether or not an images or its screenview has been visited)
159	* has already been examined
160	*
161	* @param url_string the url to check
162	* @param img_name the image to check
163	* @return true if the url has been visited, false if not */
164	public boolean image_visited(String url_string, String img_name)
165	{
166	String hash_dir = url_string.substring(0,url_string.lastIndexOf("/"));
167
168	if ( visited_images_.containsKey(hash_dir)){
169	Hashtable hashed_images = (Hashtable)visited_images_.get(hash_dir);
170
171	if (img_name.startsWith("screenview")){
172	return true;
173	}
174
175	if (hashed_images.containsKey(img_name)){
176	return true;
177	}
178
179	Enumeration enu = hashed_images.keys();
180	for(;enu.hasMoreElements();){
181	String name = (String)enu.nextElement();
182	if(name.startsWith("screenview")){
183	return true;
184	}
185	}
186
187	hashed_images.put(img_name,"visited");
188	}
189	else{
190	Hashtable hashed_images = new Hashtable();
191	hashed_images.put(img_name,"visited");
192	visited_images_.put(hash_dir,hashed_images);
193	}
194
195	return false;
196	}
197
198	// some other thread can call this method to tell this thread to stop running
199	public void stopRunning() {
200	if (verbosity_ >= 3) {
201	System.err.println("**** DownloadUrls.stopRunning() called");
202	}
203
204	stop_running = true;
205	// Interrupt this thread, even if it's not the one running
206	// Just want to make sure the DownloadURls' thread the CURL object runs in
207	// gets interrupted if it's what's currently running
208	if(!this.isInterrupted()) {
209	this.interrupt();
210	}
211	if(!Thread.currentThread().isInterrupted()) {
212	Thread.currentThread().interrupt();
213	}
214	}
215
216	public boolean isStopping() {
217	return stop_running;
218	}
219
220
221	/** Restricts the type of images that can be included in the collage
222	*
223	* @param url_string the url to check
224	* @return true if the image is of a specified type, false if not */
225	public boolean image_file_extension(String url_string)
226	{
227	// lower case comparisons
228	String url_lstring = url_string.toLowerCase();
229
230
231	// greenstone3 can add jsessionids at end, which messes up image file extension detection
232	int jsessionID_index = url_lstring.indexOf(";jsessionid=");
233	if(jsessionID_index >= 0) {
234	url_lstring = url_lstring.substring(0, jsessionID_index);
235	}
236
237	if (image_type_ == null)
238	return true;
239
240	String tmp = image_type_;
241	String original_image_type_ = image_type_;
242
243	while (image_type_ != null && image_type_.indexOf("%") >= 0) {
244
245	tmp = image_type_.substring(0, image_type_.indexOf("%"));
246
247	if (image_type_.length() > image_type_.indexOf("%") + 1)
248	image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
249	else
250	image_type_ = null;
251
252	if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
253	image_type_ = original_image_type_;
254	return true;
255	}
256	}
257
258	if (image_type_ != null && url_lstring.endsWith(image_type_)) {
259	image_type_ = original_image_type_;
260	return true;
261	}
262
263	image_type_ = original_image_type_;
264	return false;
265	}
266
267	/** Restricts images to only those that satisfy several specified conditions
268	* regarding the content of the image name and url.
269	*
270	* @param url_string the url to check
271	* @return true if the image is satisfactory, false if not */
272	public boolean filter_image(String url_string)
273	{
274
275	if (image_ignore_==null \|\| !url_string.startsWith(image_ignore_))
276	{
277	if (!already_visited(url_string))
278	{
279	if (image_mustnothave_ != null) {
280
281	String tmp = image_mustnothave_;
282	String original_image_mustnothave_ = image_mustnothave_;
283
284	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
285
286	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
287	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
288	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
289	image_mustnothave_.length());
290	else
291	image_mustnothave_ = null;
292
293
294
295	if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
296
297	image_mustnothave_ = original_image_mustnothave_;
298	return false;
299	}
300	}
301
302	image_mustnothave_ = original_image_mustnothave_;
303
304	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
305	image_mustnothave_ = original_image_mustnothave_;
306	return false;
307	}
308
309	if (verbosity_ > 2) {
310	System.err.println("src url = "+ url_string);
311	}
312
313	image_mustnothave_ = original_image_mustnothave_;
314
315	}
316
317	} else { // already visited this image link
318	System.err.println("\t####" + url_string + " already visited - filter_image returning false");
319	// Isn't it that if we've already visited the image link once before,
320	// we've dealt with it anyway once before (in one way or another: decided it
321	// didn't pass the filter, or added the image for download if it did pass the
322	// filters ) so we don't process this image again again?
323	return false;
324	}
325
326	}
327
328	return true;
329	}
330
331	/** Restricts links to only those that satisfy several specified conditions
332	* regarding the address of the link.
333	*
334	* @param url_string the url to check
335	* @param new_url_string the url from which this link was found
336	* @param depth the number of links followed on this path
337	* @return true if the image is satisfactory, false if not */
338	public boolean filter_href(String url_string, String new_url_string, int depth)
339	{
340	boolean has_href = false;
341	String tmp = href_musthave_;
342	String original_href_musthave_ = href_musthave_;
343
344	// checks that it does contain this content
345	if (href_musthave_ != null) {
346
347	while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
348
349	tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
350	if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
351	href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
352	else
353	href_musthave_ = null;
354
355	if (url_string.indexOf(tmp) >= 0)
356	has_href = true;
357	}
358
359	if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
360	has_href = true;
361
362	href_musthave_ = original_href_musthave_;
363	}
364
365	tmp = image_mustnothave_;
366	String original_image_mustnothave_ = image_mustnothave_;
367
368	// checks that it doesn't contain this content
369	if (image_mustnothave_ != null) {
370
371	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
372
373	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
374	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
375	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
376	else
377	image_mustnothave_ = null;
378
379	if (url_string.indexOf(tmp) >= 0)
380	has_href = false;
381	}
382	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
383	has_href = false;
384
385	image_mustnothave_ = original_image_mustnothave_;
386	}
387
388	// return true if the link is valid and false if not
389	if (href_musthave_==null \|\| has_href)
390	{
391	// might be another URL
392	if (depth < app_.maxDepth())
393	{
394	if (!new_url_string.startsWith(url_string))
395	{
396	return true;
397	}
398	}
399	}
400	return false;
401	}
402
403	/** Adds an image to the stored downloaded images as a triplet.
404	* Ensures that the number of images downloaded but not displayed at
405	* any one time is controlled by using a buffer. If the buffer is
406	* full this function will wait until space becomes available before
407	* continuing. It also restricts the
408	* total number of images to download as specified by the applet.
409	*
410	* @param url the image to download
411	* @param from_url the url that this image was sourced from
412	* @param img_name the name of the image */
413	public void add_image(URL url, String from_url, String img_name)
414	{
415	// get the image from the url
416	if (verbosity_>=2) {
417	System.err.println(" Downloading image URL: " + url.toString());
418	}
419
420	if (image_visited(url.toString(),img_name)) return;
421
422	int size = download_images_.downloadImage(tracker,url, from_url, img_name);
423
424	try{
425	// if have completed the maximum number of downloads for the
426	// application then stop downloading
427	if (size == app_.maxDownloads()) {
428	// NOTE: the app can continue displaying images forever after download is
429	// finished, until interrupted/stopped.
430	// So don't set stop_running=false just because downloads have finished.
431	//stop_running = true; // Don't do this!
432	//thread_running = false;
433	//thread.currentThread().interrupt();
434
435	stop_downloading = true;
436	//stop(); // TODO, remove this, replacing with above
437
438	}
439
440	}
441	catch (Exception e) {
442	thread_running_ = false;
443	//stop(); // TODO
444	stop_downloading = true;
445	e.printStackTrace();
446	}
447	}
448
449	/** Connects to the starting url and looks for all images and links from this
450	* original page. Image links are processed first, so that any images found can be
451	* downloaded immediately and placed on the applet. Secondly, the links to other
452	* pages are recursively processed by this function and treated as a starting url
453	*
454	* @param new_url the url from which to start searching for images and links
455	* @param depth the number of links that have been followed on this path */
456	public void rec_add_images(String new_url, int depth)
457	{
458	// Check if the application's stopping, to end this recursive function as soon as possible
459	if(stop_running) {
460	return;
461	}
462
463	if (verbosity_ >= 2) {
464	System.err.println("*** Inspecting url: " + new_url);
465	}
466
467	if (already_visited(new_url)) return;
468
469	// check if there is a scenario where external hyperlinks are being used
470	externalLinks();
471	String img_name = new String();
472
473	// connect to the url
474	// stopRunning would have set the interrupted flag, and
475	// CURL checks for that in its loop, outside its potentially-blocking read() call
476	CURL curl = (app_.gsdlversion == 3) ? new CURL(new_url, app_.baseURL) : new CURL(new_url);
477
478	if (curl.connected_ok())
479	{
480	if (verbosity_ >= 1) {
481	System.err.print("Connected OK ... ");
482	}
483
484	// read the page
485	curl.readAll();
486	if (verbosity_ >= 1) {
487	System.err.println("URL read.");
488	}
489
490	// get all the <code><img src=</code> links into a vector
491	Vector src_links = curl.getSrcLinks();
492
493	if (verbosity_ >= 2) {
494	System.err.println(" Got src links... there are " + src_links.size() + " of them.");
495	}
496	// process each of the image links according to the parameters given.
497	for (int i = 0; i < src_links.size() && !stop_running && !stop_downloading; i++)
498	{
499	URL url = (URL)src_links.get(i);
500	String url_string = url.toString();
501
502	//System.err.println(" source links " + i + " [" + url_string +"]");
503
504	if (verbosity_ >= 4) {
505	System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
506	}
507
508	if (image_file_extension(url_string))
509	{
510	if (filter_image(url_string))
511	{
512	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
513
514	if (verbosity_ >= 2) {
515	System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
516	}
517
518	if ((external_links_ != null) && (!external_links_.isEmpty())) {
519	String ext = (String) external_links_.get(img_name);
520
521
522	if (ext != null){
523	add_image(url, ext, img_name);
524
525	}
526	else{
527
528	add_image(url, new_url, img_name);
529	}
530	}
531	else {
532
533	add_image(url, new_url, img_name);
534	}
535
536
537	}
538
539	}
540
541	}
542
543	if(stop_running && verbosity_ >= 3) {
544	System.err.println("*** DownloadUrls.rec_add_images() - Asked to stop running");
545	return;
546	}
547
548	// get all the <code><a href=</code> links into a vector
549	Vector href_links = curl.getHrefLinks();
550
551	if (verbosity_ >= 2) {
552	System.err.println(" Got href links... there are " + href_links.size() + " of them.");
553	}
554
555
556	// process each of the href links according to the parameters given.
557	for (int i = 0; i < href_links.size() && !stop_running && !stop_downloading; i++)
558	{
559
560	URL url = (URL)href_links.get(i);
561	String url_string = url.toString();
562	//System.err.println(" href links " + i + "[" + url_string +"]");
563
564	if (image_file_extension(url_string))
565	{
566
567	if (filter_image(url_string))
568
569	{
570
571	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
572	if (verbosity_ >= 2) {
573	System.err.println(" Filtered: href_link[" + i + "] = " + url_string);
574	}
575	if ((external_links_ != null) && (!external_links_.isEmpty())) {
576	String ext = (String) external_links_.get(img_name);
577
578	if (ext != null)
579	add_image(url, ext, img_name);
580	else
581	add_image(url, new_url, img_name);
582	}
583	else {
584	add_image(url, url_string, img_name);
585	}
586	}
587	}
588	else
589	{
590	if (filter_href(url_string,new_url,depth))
591	{
592	// If application has stopped, then don't do the
593	// recursive call, so we stop faster before exploring yet
594	// more links and deciding to stop then
595	if(stop_running) {
596	return;
597	}
598	rec_add_images(url_string,depth+1);
599
600	}
601	}
602	}
603	}
604
605	else {
606	System.err.println("Unable to download "+new_url);
607	unable_to_download = true;
608	}
609
610	if(stop_running && verbosity_ >= 3) {
611	System.err.println("*** DownloadUrls.rec_add_images() thread has been told to stop.");
612	}
613	}
614
615	public boolean wasUnableToDownload() { return unable_to_download; }
616
617
618	/** Used in cases where the image maps to a url outside of it's original location.
619	* When used with Greenstone the collage images will refer to documents in the collections
620	* from which the images are sourced. When used individually, the images may be saved into
621	* a user directory and the pages they reference may be external hyperlinks.
622	* This function reads that external links file and creates a hash map of the image to
623	* its external hyperlink. If the file does not exist the download thread will continue
624	* and assume the first case, that links are internal. */
625	public void externalLinks() {
626	external_links_ = null;
627	try {
628
629	if (starting_url_ == null \|\| (document_root_ != null && starting_url_.indexOf(document_root_) >= 0) ){
630	if (verbosity_ >= 3) {
631	System.err.println("**** " + starting_url_ + " is not an external link.");
632	}
633	return;
634	}
635
636	// open a url to the file written
637	URL u = new URL(starting_url_ + "externallinks");
638
639	BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
640
641	external_links_ = new Hashtable();
642
643	String l = r.readLine();
644	// split the line of the space, first part is the image, second part the link
645	while (l != null && !stop_running && !stop_downloading) {
646
647	String tmp1 = new String();
648	String tmp2 = new String();
649
650	if (l.indexOf(" ") >= 0) {
651
652	tmp1 = l.substring(0, l.indexOf(" "));
653	if (l.length() > l.indexOf(" ") + 1)
654	tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
655	else
656	tmp2 = null;
657
658	if (tmp2 != null) {
659
660	external_links_.put(tmp1, tmp2);
661	//System.err.println(tmp1 + " " + tmp2);
662	}
663	}
664	l = r.readLine();
665	}
666
667	r.close();
668
669	if(stop_running && verbosity_ >= 3) {
670	System.err.println("*** DownloadUrls.externalLinks(): Asked to stop running");
671	}
672
673	} catch (Exception e) {
674	e.printStackTrace();
675	return;
676	}
677	}
678
679	/** Controls the download thread */
680	public void run ()
681	{
682	System.err.println("Starting download thread.");
683	visited_url_ = new Hashtable();
684	visited_images_ = new Hashtable();
685
686	rec_add_images(starting_url_,1);
687	download_images_.stopDownload();
688	System.err.println("DownloadUrls.run() - download thread finished.");
689	}
690	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/applet/GsdlCollageApplet/DownloadUrls.java@ 38968

Download in other formats: