Context Navigation

source: main/trunk/greenstone2/perllib/giget.pm@ 24600

Last change on this file since 24600 was 24600, checked in by ak19, 13 years ago
Added gs-magick.pl script which will set the environment for ImageMagick (including LD_LIBRARY_PATH) before launching the requested ImageMagick command and arguments. By setting the Imagemagick environment from this script we ensure that the modified env variables don't create conflicts with libraries needed for normal linux execution. All the Greenstone files in the binary that made direct calls to imagemagick now go through this script. The affected files are perl files in bin/script and perllib and Gatherer.java of GLI. (wvware has files that test for imagemagick during compilation stage, which is independent of our changs which are only for users running imagemagick from a GS binary.) The final problems were related to how different perl files made use of the return values and the output of running their imagemagick command: they would query the 127 and/or and/or run the command with backtick operators to get the output printed to STDOUT. By inserting an intermediate gs-magick.pl file, needed to ensure that the exit code stored in 127 would at least be passed on correctly, as is necessary when testing the exit code against non-zero values or greater/less than zero (instead of comparing them with equals/not equal to 0). To get the correct exit code as emitted by imagemagick, calling code needs to shift bits in 127 and converting it to a signed value.
Property svn:keywords set to `Author Date Id Revision`
File size: 5.7 KB

Line
1	use strict;
2	use util;
3
4
5	sub readin_html
6	{
7	my ($html_fname) = @_;
8
9	open(HIN,"<$html_fname")
10	\|\| die "Unable to open $html_fname: $!\n";
11
12	my $html_text;
13	my $line;
14	while (defined ($line=<HIN>)) {
15	$html_text .= $line;
16	}
17	close(HIN);
18
19	return $html_text;
20	}
21
22	sub stripout_anchortags
23	{
24	my ($html_text) = @_;
25
26	my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)+/gs);
27
28	return @anchor_tags;
29	}
30
31
32	sub print_tags
33	{
34	my (@tags) = @_;
35
36	my $a;
37	foreach $a ( @tags) {
38	print "$a\n";
39	}
40	}
41
42	sub filter_tags
43	{
44	my ($filter_text,@tags) = @_;
45
46	my @filtered_tags = ();
47
48	my $t;
49	foreach $t (@tags) {
50	if ($t =~ m/$filter_text/x) {
51	push(@filtered_tags,$t);
52	}
53	}
54
55	return @filtered_tags;
56	}
57
58	sub extract_urls {
59	my (@tags) = @_;
60
61	my @urls = ();
62
63	my $t;
64	foreach $t (@tags) {
65	if ($t =~ m/href=([^ ]+)/i) {
66	my $url = $1;
67	$url =~ s/&/&/g;
68	push(@urls,$url);
69	}
70	}
71
72	return @urls;
73	}
74
75	sub get_gi_page
76	{
77	my ($cgi_base,$cgi_call,$downloadto_fname) = @_;
78
79	my $full_url = "$cgi_base$cgi_call";
80
81	if ((!-e $downloadto_fname) \|\| (-z $downloadto_fname)) {
82	my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
83	## print STDERR "*** wget cmd:\n $cmd\n";
84
85	`$cmd`;
86	}
87
88	if (-z $downloadto_fname) {
89	print STDERR "Warning: downloaded file 0 bytes!\n";
90	}
91	}
92
93
94	sub parse_gi_search_page
95	{
96	my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;
97
98	my $nextpage_url = undef;
99
100	my @imgref_urls = ();
101
102	my $downloaded_text = readin_html($downloaded_fname);
103	if (defined $downloaded_text) {
104	my @anchor_tags = stripout_anchortags($downloaded_text);
105
106	my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
107	my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);
108
109	my @thumbimg_urls = extract_urls(@thumbimg_tags);
110	my @nextpage_urls = extract_urls(@nextpage_tags);
111
112	my $curr_start = 0;
113	if ($currpage_url =~ m/start=(\d+)/) {
114	$curr_start = $1;
115	}
116
117	my $pot_url;
118	foreach $pot_url (@nextpage_urls) {
119
120	my ($next_start) = ($pot_url =~ m/start=(\d+)/);
121	if ($next_start>$curr_start) {
122	$nextpage_url = $pot_url;
123	last;
124	}
125	}
126
127	# print "-" x 40, "\n";
128	my $c = 1;
129	my $p = 1;
130
131	foreach my $tvu (@thumbimg_urls) {
132	my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
133	$img_url =~ s/%25/%/g;
134
135	my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
136	## print STDERR "****imgref_url = $imgref_url\n";
137	$imgref_url =~ s/%25/%/g;
138
139	my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
140	$img_ext = lc($img_ext);
141
142	# remove http:// if there, so later we can explicitly add it in
143	$img_url =~ s/^http:\/\///;
144
145	print "Downloading image url http://$img_url\n";
146	my $output_fname = "$search_term_dir/img_$c.$img_ext";
147
148	get_gi_page("http://",$img_url,$output_fname);
149
150	if (-s $output_fname == 0) {
151	unlink $output_fname;
152	}
153	else {
154	# need to shift the $? exit code returned by system() by 8 bits and
155	# then convert it to a signed value to work out whether it is indeed > 0
156	my $status = system("\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\"");
157	$status >>= 8;
158	$status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
159
160	if($status > 0 ) {
161	print STDERR "**** NOT JPEG: output_fname \n";
162	unlink $output_fname;
163	}
164	else {
165	my $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\" 2>&1";
166	my $result = `$command`;
167
168	my $type = 'unknown';
169	my $width = 'unknown';
170	my $height = 'unknown';
171
172	my $image_safe = quotemeta $output_fname;
173	if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
174	$type = $1;
175	$width = $2;
176	$height = $3;
177	}
178
179	my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
180
181	if (($width ne "unknown") && ($height ne "unknown")) {
182	if (($width>200) \|\| ($height>200)) {
183	`$imagick_cmd convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
184	`/bin/mv /tmp/x.jpg \"$output_fname\"`;
185	}
186	}
187	$c++;
188	}
189	}
190
191	push(@imgref_urls,$imgref_url);
192
193	last if ($c==3); # Only take first 2
194
195	$p++;
196
197	if ($p==20) {
198	print STDERR "*** Unable to get enough images after 20 passes\n";
199	last;
200	}
201
202
203	}
204
205	if (defined $nextpage_url) {
206	print "Next page URL:\n";
207	print_tags($nextpage_url);
208	}
209	# print "-" x 40, "\n";
210	}
211
212	return ($nextpage_url, \@imgref_urls);
213	}
214
215	sub make_search_term_safe
216	{
217	my ($search_terms) = @_;
218
219	my $search_term_safe = join("+",@$search_terms);
220	$search_term_safe =~ s/\"/%22/g;
221	$search_term_safe =~ s/ /+/g;
222
223	return $search_term_safe;
224	}
225
226	sub gi_query_url
227	{
228	my ($search_term) = @_;
229
230	my $search_term_safe = make_search_term_safe($search_term);
231
232	my $nextpage_url
233	= "/images?as_filetype=jpg&imgc=color\&ie=UTF-8\&oe=UTF-8\&hl=en\&btnG=Google+Search";
234	$nextpage_url .= "\&q=$search_term_safe";
235
236	return $nextpage_url;
237	}
238
239	sub gi_url_base
240	{
241	return "http://images.google.com";
242	}
243
244	sub giget
245	{
246	my ($search_terms,$output_dir) = @_;
247	my $imgref_urls = [];
248
249	if (!-e $output_dir) {
250	mkdir($output_dir);
251
252	}
253
254	print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";
255
256	my $gi_base = gi_url_base();
257	my $nextpage_url = gi_query_url($search_terms);
258
259	my $respage_fname = "$output_dir/respage1.html";
260	get_gi_page($gi_base,$nextpage_url,$respage_fname);
261
262	($nextpage_url, $imgref_urls)
263	= parse_gi_search_page($gi_base,$output_dir,
264	$respage_fname,$nextpage_url);
265	# else {
266	# print STDERR " Images already mirrored\n";
267	# }
268
269	print STDERR "-" x 40, "\n";
270
271	return $imgref_urls;
272	}
273
274
275	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: