Context Navigation

DownloadUrls.java@ 38871

Last change on this file since 38871 was 38871, checked in by anupama, 3 months ago

I've got the CURL and DownloadUrls methods now finding the right image URLs and suffixing them to the correct baseURL. I'm still hardcoding some applet params, including new ones, into the Java code to get it this far. However, now that the image URLs are being worked out properly, I'm hitting a security exception when it tries to download the first correct image. Some googling seemed to indicate that the applet needs to be signed or not be running in a sandbox. I'm wondering whether the appletviewer, not being the server, doesn't allow the applet to access images on a distinct server URL, and whether, if I could get the applet running as webswing on the server itself, it would have the right permissions to access/download the images. That will be the next step. If that doesn't work, I will need to first try to rewrite this JApplet as an application and see if that change makes a difference.

File size: 20.0 KB

Line
1	package org.greenstone.applet.GsdlCollageApplet;
2
3	import java.awt.*;
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7
8	import javax.swing.ImageIcon; //****
9
10
11	/**
12	* @author Katrina Edgar
13	* @author David Bainbridge
14	*
15	* Controls retrieval of images from the specified starting url. Follows appropriate
16	* links from this starting point, traversing in a tree-like state through several other
17	* pages. Filters images and links based on specified parameters. Also controls the quantity
18	* of downloading that occurs by restricting the number of downloaded images that are yet to
19	* be displayed to 10, and the total number of downloads allowed is also restricted by
20	* the applet application (to prevent downloading occuring infinitely). */
21
22	public class DownloadUrls extends Thread {
23	// for GS3
24	String gs3CollImgPath = null;
25	String baseURL = null;
26
27	/** Refers to applet */
28	GsdlCollageApplet app_ = null;
29	/** Refers to download thread */
30	DownloadImages download_images_ = null;
31
32	/** The address from which the application should start looking for images */
33	String starting_url_ = null;
34
35	/** the root directory of Greenstone*/
36	String document_root_ = null;
37
38
39	/** CHRIS - Holds the contents of the collection's assoc directory */
40	// File[] assocDir_ = null;
41
42	/** Restricts links followed from the starting url to links that contain this string */
43	String href_musthave_ = null;
44	/** Restricts links followed from the starting url to links that do not contain this string.
45	* Also prevents image names from containing this string */
46	String image_mustnothave_ = null;
47	/** Ignore images whose names begin with this string */
48	String image_ignore_ = null;
49	/** Restricts the types of images included in the collage, for example jpg, gif, etc. */
50	String image_type_ = null;
51
52	/** A static delay used when attempting to download more images into a full downloading buffer */
53	final int delay_ = 3000;
54	/** The maximum number of images to have downloaded and not yet displayed */
55	final int buffer_size_ = 1;
56
57	/** Used in cases where the image maps to a url outside of it's original location.
58	* When used with Greenstone the collage images will refer to documents in the collections
59	* from which the images are sourced. When used individually, the images may be saved into
60	* a user directory and the pages they reference may be external hyperlinks. */
61	Hashtable external_links_ = null;
62
63	/** Records all urls which have already been examined */
64	Hashtable visited_url_ = null;
65	/** Determines whether there are still pages to examine and images to download */
66	boolean thread_running_ = true;
67
68	int verbosity_ = 0;
69
70	/** Records all images which have already been examined */
71	Hashtable visited_images_ = null;
72
73	MediaTracker tracker;
74
75	/** Constructor to initialise a download thread from which images are found,
76	* saves parameters into local variables for use within the class.
77	*
78	* @param app reference to the applet
79	* @param download_images class which stores the images retrieved in triplets
80	* @param starting_url the url from which the search for images should begin
81	* @param href_musthave restricts links to only those containing this string
82	* @param image_mustnothave restricts links and image names to only those that don't contain this string
83	* @param image_ignore restricts the beginning of image names
84	* @param image_type restricts the type of images included in the collage to those named */
85	public DownloadUrls(GsdlCollageApplet app,
86	DownloadImages download_images, String starting_url,
87	String href_musthave, String image_mustnothave,
88	String image_ignore, String image_type, String document_root,int verbosity, MediaTracker trk)
89	{
90	super("DownloadUrls");
91	app_ = app;
92	download_images_ = download_images;
93
94	starting_url_ = starting_url;
95	href_musthave_ = href_musthave;
96	image_mustnothave_ = image_mustnothave;
97	image_ignore_ = image_ignore;
98	image_type_ = image_type;
99	document_root_ = document_root;
100	verbosity_ = verbosity;
101	tracker = trk;
102
103	System.err.println("starting_url_ " + starting_url +"\n"+
104	"href_musthave_ " + href_musthave +"\n"+
105	"image_mustnothave_ " + image_mustnothave+"\n"+
106	"image_ignore_ "+ image_ignore+"\n"+
107	"image_type_ "+ image_type+"\n"+
108	"document root "+ document_root_
109	);
110
111
112
113
114	}
115
116	public void setupForGS3(String gs3CollImgPath, String baseURL)
117	{
118	this.gs3CollImgPath = gs3CollImgPath;
119	this.baseURL = baseURL;
120	}
121
122	/** Determines whether or not a url has already been examined
123	*
124	* @param url_string the url to check
125	* @return true if the url has been visited, false if not */
126	public boolean already_visited(String url_string)
127	{
128	int hash_pos = url_string.indexOf("#");
129	if (hash_pos>0)
130	{
131	// strip off #anchor reference
132	url_string = url_string.substring(0,hash_pos);
133	}
134
135	// if the url has been visited before, return true
136	if (visited_url_.containsKey(url_string))
137	{
138	if (verbosity_ > 3)
139	{
140	System.err.println("Visited " + url_string + " before!");
141	}
142	return true;
143	}
144
145	visited_url_.put(url_string,"visited");
146
147	return false;
148	}
149
150	/** Determines whether or not an images or its screenview has been visited)
151	* has already been examined
152	*
153	* @param url_string the url to check
154	* @param img_name the image to check
155	* @return true if the url has been visited, false if not */
156	public boolean image_visited(String url_string, String img_name)
157	{
158	String hash_dir = url_string.substring(0,url_string.lastIndexOf("/"));
159
160	if ( visited_images_.containsKey(hash_dir)){
161	Hashtable hashed_images = (Hashtable)visited_images_.get(hash_dir);
162
163	if (img_name.startsWith("screenview")){
164	return true;
165	}
166
167	if (hashed_images.containsKey(img_name)){
168	return true;
169	}
170
171	Enumeration enu = hashed_images.keys();
172	for(;enu.hasMoreElements();){
173	String name = (String)enu.nextElement();
174	if(name.startsWith("screenview")){
175	return true;
176	}
177	}
178
179	hashed_images.put(img_name,"visited");
180	}
181	else{
182	Hashtable hashed_images = new Hashtable();
183	hashed_images.put(img_name,"visited");
184	visited_images_.put(hash_dir,hashed_images);
185	}
186
187	return false;
188	}
189
190
191
192
193
194	/** Restricts the type of images that can be included in the collage
195	*
196	* @param url_string the url to check
197	* @return true if the image is of a specified type, false if not */
198	public boolean image_file_extension(String url_string)
199	{
200	// lower case comparisons
201	String url_lstring = url_string.toLowerCase();
202
203
204	// greenstone3 can add jsessionids at end, which messes up image file extension detection
205	int jsessionID_index = url_lstring.indexOf(";jsessionid=");
206	if(jsessionID_index >= 0) {
207	url_lstring = url_lstring.substring(0, jsessionID_index);
208	}
209
210	if (image_type_ == null)
211	return true;
212
213	String tmp = image_type_;
214	String original_image_type_ = image_type_;
215
216	while (image_type_ != null && image_type_.indexOf("%") >= 0) {
217
218	tmp = image_type_.substring(0, image_type_.indexOf("%"));
219
220	if (image_type_.length() > image_type_.indexOf("%") + 1)
221	image_type_ = image_type_.substring(image_type_.indexOf("%") + 1, image_type_.length());
222	else
223	image_type_ = null;
224
225	if (!tmp.trim().equals("") && url_lstring.endsWith(tmp)) {
226	image_type_ = original_image_type_;
227	return true;
228	}
229	}
230
231	if (image_type_ != null && url_lstring.endsWith(image_type_)) {
232	image_type_ = original_image_type_;
233	return true;
234	}
235
236	image_type_ = original_image_type_;
237	return false;
238	}
239
240	/** Restricts images to only those that satisfy several specified conditions
241	* regarding the content of the image name and url.
242	*
243	* @param url_string the url to check
244	* @return true if the image is satisfactory, false if not */
245	public boolean filter_image(String url_string)
246	{
247
248	if (image_ignore_==null \|\| !url_string.startsWith(image_ignore_))
249	{
250	if (!already_visited(url_string))
251	{
252	if (image_mustnothave_ != null) {
253
254	String tmp = image_mustnothave_;
255	String original_image_mustnothave_ = image_mustnothave_;
256
257	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
258
259	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
260	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
261	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1,
262	image_mustnothave_.length());
263	else
264	image_mustnothave_ = null;
265
266
267
268	if (!tmp.trim().equals("") && url_string.indexOf(tmp) >= 0) {
269
270	image_mustnothave_ = original_image_mustnothave_;
271	return false;
272	}
273	}
274
275	image_mustnothave_ = original_image_mustnothave_;
276
277	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0) {
278	image_mustnothave_ = original_image_mustnothave_;
279	return false;
280	}
281
282	if (verbosity_ > 2) {
283	System.err.println("src url = "+ url_string);
284	}
285
286	image_mustnothave_ = original_image_mustnothave_;
287
288	}
289
290	} else { // already visited this image link
291	System.err.println("\t####" + url_string + " already visited - filter_image returning false");
292	// Isn't it that if we've already visited the image link once before,
293	// we've dealt with it anyway once before (in one way or another: decided it
294	// didn't pass the filter, or added the image for download if it did pass the
295	// filters ) so we don't process this image again again?
296	return false;
297	}
298
299	}
300
301	return true;
302	}
303
304	/** Restricts links to only those that satisfy several specified conditions
305	* regarding the address of the link.
306	*
307	* @param url_string the url to check
308	* @param new_url_string the url from which this link was found
309	* @param depth the number of links followed on this path
310	* @return true if the image is satisfactory, false if not */
311	public boolean filter_href(String url_string, String new_url_string, int depth)
312	{
313	boolean has_href = false;
314	String tmp = href_musthave_;
315	String original_href_musthave_ = href_musthave_;
316
317	// checks that it does contain this content
318	if (href_musthave_ != null) {
319
320	while (href_musthave_ != null && href_musthave_.indexOf("%") >= 0) {
321
322	tmp = href_musthave_.substring(0, href_musthave_.indexOf("%"));
323	if (href_musthave_.length() > href_musthave_.indexOf("%") + 1)
324	href_musthave_ = href_musthave_.substring(href_musthave_.indexOf("%") + 1, href_musthave_.length());
325	else
326	href_musthave_ = null;
327
328	if (url_string.indexOf(tmp) >= 0)
329	has_href = true;
330	}
331
332	if (href_musthave_ != null && url_string.indexOf(href_musthave_) >= 0)
333	has_href = true;
334
335	href_musthave_ = original_href_musthave_;
336	}
337
338	tmp = image_mustnothave_;
339	String original_image_mustnothave_ = image_mustnothave_;
340
341	// checks that it doesn't contain this content
342	if (image_mustnothave_ != null) {
343
344	while (image_mustnothave_ != null && image_mustnothave_.indexOf("%") >= 0) {
345
346	tmp = image_mustnothave_.substring(0, image_mustnothave_.indexOf("%"));
347	if (image_mustnothave_.length() > image_mustnothave_.indexOf("%") + 1)
348	image_mustnothave_ = image_mustnothave_.substring(image_mustnothave_.indexOf("%") + 1, image_mustnothave_.length());
349	else
350	image_mustnothave_ = null;
351
352	if (url_string.indexOf(tmp) >= 0)
353	has_href = false;
354	}
355	if (image_mustnothave_ != null && url_string.indexOf(image_mustnothave_) >= 0)
356	has_href = false;
357
358	image_mustnothave_ = original_image_mustnothave_;
359	}
360
361	// return true if the link is valid and false if not
362	if (href_musthave_==null \|\| has_href)
363	{
364	// might be another URL
365	if (depth < app_.maxDepth())
366	{
367	if (!new_url_string.startsWith(url_string))
368	{
369	return true;
370	}
371	}
372	}
373	return false;
374	}
375
376	/** Adds an image to the stored downloaded images as a triplet.
377	* Ensures that the number of images downloaded but not displayed at
378	* any one time is controlled by using a buffer. If the buffer is
379	* full this function will wait until space becomes available before
380	* continuing. It also restricts the
381	* total number of images to download as specified by the applet.
382	*
383	* @param url the image to download
384	* @param from_url the url that this image was sourced from
385	* @param img_name the name of the image */
386	public void add_image(URL url, String from_url, String img_name)
387	{
388	// get the image from the url
389	if (verbosity_>=2) {
390	System.err.println(" Downloading image URL: " + url.toString());
391	}
392
393	if (image_visited(url.toString(),img_name)) return;
394
395	int size = download_images_.downloadImage(tracker,url, from_url, img_name);
396
397	try{
398	// if have completed the maximum number of downloads for the
399	// application then stop
400	if (size == app_.maxDownloads()) {
401	stop();
402	}
403
404	}
405	catch (Exception e) {
406	thread_running_ = false;
407	stop();
408	e.printStackTrace();
409	}
410	}
411
412	/** Connects to the starting url and looks for all images and links from this
413	* original page. Image links are processed first, so that any images found can be
414	* downloaded immediately and placed on the applet. Secondly, the links to other
415	* pages are recursively processed by this function and treated as a starting url
416	*
417	* @param new_url the url from which to start searching for images and links
418	* @param depth the number of links that have been followed on this path */
419	public void rec_add_images(String new_url, int depth)
420	{
421
422	if (verbosity_ >= 2) {
423	System.err.println("*** Inspecting url: " + new_url);
424	}
425
426	if (already_visited(new_url)) return;
427
428	// check if there is a scenario where external hyperlinks are being used
429	externalLinks();
430	String img_name = new String();
431
432	// connect to the url
433	CURL curl = (app_.gsdlversion == 3) ? new CURL(new_url, this.baseURL) : new CURL(new_url);
434
435	if (curl.connected_ok())
436	{
437	if (verbosity_ >= 1) {
438	System.err.print("Connected OK ... ");
439	}
440
441	// read the page
442	curl.readAll();
443	if (verbosity_ >= 1) {
444	System.err.println("URL read.");
445	}
446
447	// get all the <code><img src=</code> links into a vector
448	Vector src_links = curl.getSrcLinks();
449
450	if (verbosity_ >= 2) {
451	System.err.println(" Got src links... there are " + src_links.size() + " of them.");
452	}
453	// process each of the image links according to the parameters given.
454	for (int i = 0; i < src_links.size(); i++)
455	{
456	URL url = (URL)src_links.get(i);
457	String url_string = url.toString();
458
459	//System.err.println(" source links " + i + " [" + url_string +"]");
460
461	if (verbosity_ >= 4) {
462	System.err.println(" Unfiltered: src_link[" + i + "] = " + url_string);
463	}
464
465	if (image_file_extension(url_string))
466	{
467	if (filter_image(url_string))
468	{
469	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
470
471	if (verbosity_ >= 2) {
472	System.err.println(" Filtered: src_link[" + i + "] = " + url_string);
473	}
474
475	if ((external_links_ != null) && (!external_links_.isEmpty())) {
476	String ext = (String) external_links_.get(img_name);
477
478
479	if (ext != null){
480	add_image(url, ext, img_name);
481
482	}
483	else{
484
485	add_image(url, new_url, img_name);
486	}
487	}
488	else {
489
490	add_image(url, new_url, img_name);
491	}
492
493
494	}
495
496	}
497
498	}
499
500	// get all the <code><a href=</code> links into a vector
501	Vector href_links = curl.getHrefLinks();
502
503	if (verbosity_ >= 2) {
504	System.err.println(" Got href links... there are " + href_links.size() + " of them.");
505	}
506
507
508	// process each of the href links according to the parameters given.
509	for (int i = 0; i < href_links.size(); i++)
510	{
511
512	URL url = (URL)href_links.get(i);
513	String url_string = url.toString();
514	//System.err.println(" href links " + i + "[" + url_string +"]");
515
516	if (image_file_extension(url_string))
517	{
518
519	if (filter_image(url_string))
520
521	{
522
523	img_name = url_string.substring(url_string.lastIndexOf("/") + 1, url_string.length());
524	if (verbosity_ >= 2) {
525	System.err.println(" Filtered: href_link[" + i + "] = " + url_string);
526	}
527	if ((external_links_ != null) && (!external_links_.isEmpty())) {
528	String ext = (String) external_links_.get(img_name);
529
530	if (ext != null)
531	add_image(url, ext, img_name);
532	else
533	add_image(url, new_url, img_name);
534	}
535	else {
536	add_image(url, url_string, img_name);
537	}
538	}
539	}
540	else
541	{
542	if (filter_href(url_string,new_url,depth))
543	{
544
545	rec_add_images(url_string,depth+1);
546
547	}
548	}
549	}
550	}
551
552	else {
553	System.err.println("Unable able to download "+new_url);
554	}
555	}
556
557
558	/** Used in cases where the image maps to a url outside of it's original location.
559	* When used with Greenstone the collage images will refer to documents in the collections
560	* from which the images are sourced. When used individually, the images may be saved into
561	* a user directory and the pages they reference may be external hyperlinks.
562	* This function reads that external links file and creates a hash map of the image to
563	* its external hyperlink. If the file does not exist the download thread will continue
564	* and assume the first case, that links are internal. */
565	public void externalLinks() {
566	external_links_ = null;
567	try {
568
569	if (starting_url_ == null \|\| (document_root_ != null && starting_url_.indexOf(document_root_) >= 0) ){
570	if (verbosity_ >= 3) {
571	System.err.println("**** " + starting_url_ + " is not an external link.");
572	}
573	return;
574	}
575
576	// open a url to the file written
577	URL u = new URL(starting_url_ + "externallinks");
578
579	BufferedReader r = new BufferedReader(new InputStreamReader(u.openStream()));
580
581	external_links_ = new Hashtable();
582
583	String l = r.readLine();
584	// split the line of the space, first part is the image, second part the link
585	while (l != null) {
586
587	String tmp1 = new String();
588	String tmp2 = new String();
589
590	if (l.indexOf(" ") >= 0) {
591
592	tmp1 = l.substring(0, l.indexOf(" "));
593	if (l.length() > l.indexOf(" ") + 1)
594	tmp2 = l.substring(l.indexOf(" ") + 1, l.length());
595	else
596	tmp2 = null;
597
598	if (tmp2 != null) {
599
600	external_links_.put(tmp1, tmp2);
601	//System.err.println(tmp1 + " " + tmp2);
602	}
603	}
604	l = r.readLine();
605	}
606
607	r.close();
608
609	} catch (Exception e) {
610	e.printStackTrace();
611	return;
612	}
613	}
614
615	/** Controls the download thread */
616	public void run ()
617	{
618	System.err.println("Starting download thread.");
619	visited_url_ = new Hashtable();
620	visited_images_ = new Hashtable();
621
622	rec_add_images(starting_url_,1);
623	download_images_.stopDownload();
624	System.err.println("Download thread finished.");
625	}
626	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/applet/GsdlCollageApplet/DownloadUrls.java@ 38871

Download in other formats: