source: main/trunk/greenstone2/perllib/giget.pm @ 24600

Last change on this file since 24600 was 24600, checked in by ak19, 13 years ago

Added gs-magick.pl script which sets the environment for ImageMagick (including LD_LIBRARY_PATH) before launching the requested ImageMagick command and arguments. By setting the ImageMagick environment from this script we ensure that the modified env variables don't create conflicts with libraries needed for normal Linux execution. All the Greenstone files in the *binary* that made direct calls to ImageMagick now go through this script. The affected files are perl files in bin/script and perllib, and Gatherer.java of GLI. (wvware has files that test for ImageMagick during the compilation stage, which is independent of our changes, which only affect users running ImageMagick from a GS binary.) The final problems were related to how different perl files made use of the return values and the output of running their ImageMagick command: they would query the exit code in $? and/or run the command with backtick operators to get the output printed to STDOUT. Inserting an intermediate gs-magick.pl file meant we needed to ensure that the exit code stored in $? would at least be passed on correctly, as is necessary when testing the exit code against non-zero values or greater/less than zero (instead of comparing it with equal/not-equal to 0). To get the correct exit code as emitted by ImageMagick, calling code needs to shift the bits in $? and convert the result to a signed value.
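
A minimal sketch of the exit-code handling described above, assuming a caller that runs ImageMagick's identify through the gs-magick.pl wrapper ($image_file is just an illustrative placeholder; the same pattern appears in parse_gi_search_page below):

    # Run an ImageMagick command via the gs-magick.pl wrapper and recover
    # ImageMagick's own exit code from the value system() leaves in $?.
    my $status = system("\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$image_file\"");
    $status >>= 8;   # drop the low 8 bits (signal number and core-dump flag)
    # reinterpret the remaining 8 bits as a signed value before comparing with > 0
    $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
    if ($status > 0) {
        # identify failed, so treat the file as unusable
    }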

  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
use strict;
use util;


sub readin_html
{
    my ($html_fname) = @_;

    open(HIN,"<$html_fname")
        || die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line=<HIN>)) {
        $html_text .= $line;
    }
    close(HIN);

    return $html_text;
}

sub stripout_anchortags
{
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)+/gs);

    return @anchor_tags;
}


sub print_tags
{
    my (@tags) = @_;

    my $a;
    foreach $a ( @tags) {
        print "$a\n";
    }
}

sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = ();

    my $t;
    foreach $t (@tags) {
        if ($t =~ m/$filter_text/x) {
            push(@filtered_tags,$t);
        }
    }

    return @filtered_tags;
}

sub extract_urls {
    my (@tags) = @_;

    my @urls = ();

    my $t;
    foreach $t (@tags) {
        if ($t =~ m/href=([^ ]+)/i) {
            my $url = $1;
            $url =~ s/&amp;/&/g;
            push(@urls,$url);
        }
    }

    return @urls;
}

# Download a URL with wget, but only if the target file doesn't already
# exist or is empty; warn if the download produced a zero-byte file.
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
## print STDERR "*** wget cmd:\n $cmd\n";

        `$cmd`;
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}


# Parse a downloaded Google Images results page: pick out the thumbnail
# anchor tags and the "next page" link, download the first couple of
# matching images, and return the next-page URL plus the referring-page URLs.
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
        my @anchor_tags = stripout_anchortags($downloaded_text);

        my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
        my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

        my @thumbimg_urls = extract_urls(@thumbimg_tags);
        my @nextpage_urls = extract_urls(@nextpage_tags);

        my $curr_start = 0;
        if ($currpage_url =~ m/start=(\d+)/) {
            $curr_start = $1;
        }

        my $pot_url;
        foreach $pot_url (@nextpage_urls) {

            my ($next_start) = ($pot_url =~ m/start=(\d+)/);
            if ($next_start>$curr_start) {
                $nextpage_url = $pot_url;
                last;
            }
        }

# print "-" x 40, "\n";
        my $c = 1;
        my $p = 1;

        foreach my $tvu (@thumbimg_urls) {
            my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
            $img_url =~ s/%25/%/g;

            my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
## print STDERR "****imgref_url = $imgref_url\n";
            $imgref_url =~ s/%25/%/g;

            my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
            $img_ext = lc($img_ext);

            # remove http:// if there, so later we can explicitly add it in
            $img_url =~ s/^http:\/\///;

            print "Downloading image url http://$img_url\n";
            my $output_fname = "$search_term_dir/img_$c.$img_ext";

            get_gi_page("http://",$img_url,$output_fname);

            if (-s $output_fname == 0) {
                unlink $output_fname;
            }
            else {
                # need to shift the $? exit code returned by system() by 8 bits and
                # then convert it to a signed value to work out whether it is indeed > 0
                my $status = system("\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\"");
                $status >>= 8;
                $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);

                if ($status > 0) {
                    print STDERR "**** NOT JPEG: $output_fname\n";
                    unlink $output_fname;
                }
                else {
                    my $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\" 2>&1";
                    my $result = `$command`;

                    my $type = 'unknown';
                    my $width = 'unknown';
                    my $height = 'unknown';

                    my $image_safe = quotemeta $output_fname;
                    if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
                        $type = $1;
                        $width = $2;
                        $height = $3;
                    }

                    my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";

                    # shrink anything larger than 200x200 down to a thumbnail-sized copy
                    if (($width ne "unknown") && ($height ne "unknown")) {
                        if (($width>200) || ($height>200)) {
                            `$imagick_cmd convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
                            `/bin/mv /tmp/x.jpg \"$output_fname\"`;
                        }
                    }
                    $c++;
                }
            }

            push(@imgref_urls,$imgref_url);

            last if ($c==3); # Only take first 2

            $p++;

            if ($p==20) {
                print STDERR "*** Unable to get enough images after 20 passes\n";
                last;
            }


        }

        if (defined $nextpage_url) {
            print "Next page URL:\n";
            print_tags($nextpage_url);
        }
# print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}

sub make_search_term_safe
{
    my ($search_terms) = @_;

    my $search_term_safe = join("+",@$search_terms);
    $search_term_safe =~ s/\"/%22/g;
    $search_term_safe =~ s/ /+/g;

    return $search_term_safe;
}

sub gi_query_url
{
    my ($search_term) = @_;

    my $search_term_safe = make_search_term_safe($search_term);

    my $nextpage_url
        = "/images?as_filetype=jpg&imgc=color\&ie=UTF-8\&oe=UTF-8\&hl=en\&btnG=Google+Search";
    $nextpage_url .= "\&q=$search_term_safe";

    return $nextpage_url;
}

sub gi_url_base
{
    return "http://images.google.com";
}

# Top-level entry point: search Google Images for the given terms, download
# the first results page and a couple of matching images into $output_dir,
# and return a reference to the list of referring-page URLs.
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        mkdir($output_dir);

    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);
# else {
#     print STDERR " Images already mirrored\n";
# }

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}


1;
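
For reference, a hypothetical caller of this module might look like the sketch below (the search terms, output directory, and require path are illustrative only, not taken from the source; in practice the module is loaded from the Greenstone perllib directory):

    # load giget.pm from the perllib directory (illustrative path handling)
    require "giget.pm";

    my @search_terms = ("kea", "parrot");      # illustrative search terms
    my $output_dir   = "/tmp/giget_images";    # illustrative output directory

    # giget() downloads the first results page plus up to two images and
    # returns a reference to the list of referring-page URLs
    my $imgref_urls = giget(\@search_terms, $output_dir);

    foreach my $url (@$imgref_urls) {
        print "referring page: $url\n";
    }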