source: main/trunk/greenstone2/perllib/giget.pm@24754

Last change on this file since 24754 was 24601, checked in by ak19, 13 years ago

Dr Bainbridge suggested corrections to the commits of revision 24600: 1. gs-magick.pl: the close call on the pipe is made only if it was successfully opened. 2. Command_status always needs to be shifted and turned into its signed value for display in convertutil.pm. 3. The giget calls to ImageMagick are now more efficient: identify is not called twice, but just once, since the exit code and the output to STDOUT can both be inspected after a single call. Moreover, the exit code needed to be tested for equality against 0, not for whether it is greater than 0, so no shifting and converting to a signed value is required.
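In outline, the corrected pattern looks like the following minimal sketch (the plain identify command stands in for the gs-magick.pl wrapper, and the image path is hypothetical):

    my $command = "identify \"downloaded_image.jpg\" 2>&1";
    my $result  = `$command`;   # one call: STDOUT (and STDERR) captured here
    if ($? != 0) {
        # non-zero raw wait status: identify failed, so this is not a
        # usable image -- no bit-shifting or sign conversion of $? needed
    }
    else {
        # parse $result for the image type, width and height
    }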

  • Property svn:keywords set to Author Date Id Revision
File size: 5.6 KB
use strict;
use util;

# Slurp the contents of an HTML file into a single string
sub readin_html
{
    my ($html_fname) = @_;

    open(HIN,"<$html_fname")
        || die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line=<HIN>)) {
        $html_text .= $line;
    }
    close(HIN);

    return $html_text;
}

# Return a list of all the opening <a ...> tags in the HTML text
sub stripout_anchortags
{
    my ($html_text) = @_;

    # no '+' after the capture group: with /g each anchor tag becomes its
    # own match, whereas a trailing '+' would keep only the last tag in a
    # run of consecutive anchors
    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)/gs);

    return @anchor_tags;
}


# Print each tag on its own line
sub print_tags
{
    my (@tags) = @_;

    foreach my $tag (@tags) {
        print "$tag\n";
    }
}

# Keep only the tags that match the given regular expression
sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = ();

    foreach my $t (@tags) {
        if ($t =~ m/$filter_text/x) {
            push(@filtered_tags,$t);
        }
    }

    return @filtered_tags;
}

# Pull the href target out of each anchor tag, decoding &amp; entities
sub extract_urls {
    my (@tags) = @_;

    my @urls = ();

    foreach my $t (@tags) {
        if ($t =~ m/href=([^ ]+)/i) {
            my $url = $1;
            $url =~ s/&amp;/&/g;
            push(@urls,$url);
        }
    }

    return @urls;
}

# Download a page with wget unless a non-empty copy already exists.
# -T 10 sets a 10-second timeout, -nc avoids clobbering an existing
# download, and the Mozilla user agent stops Google rejecting the request.
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
##      print STDERR "*** wget cmd:\n $cmd\n";

        `$cmd`;
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}


# Parse a downloaded Google Images results page: pick out the thumbnail
# links and the link to the next page of results, then download each
# full-size image into $search_term_dir
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
        my @anchor_tags = stripout_anchortags($downloaded_text);

        # thumbnail links go through /imgres?...; next-page links are
        # /images?... queries carrying a start=<n> offset
        my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
        my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

        my @thumbimg_urls = extract_urls(@thumbimg_tags);
        my @nextpage_urls = extract_urls(@nextpage_tags);

        # the start= offset of the page we are currently on
        my $curr_start = 0;
        if ($currpage_url =~ m/start=(\d+)/) {
            $curr_start = $1;
        }

        # the next page is the first candidate whose start= offset lies
        # beyond the current page's offset
        foreach my $pot_url (@nextpage_urls) {
            my ($next_start) = ($pot_url =~ m/start=(\d+)/);
            if ($next_start>$curr_start) {
                $nextpage_url = $pot_url;
                last;
            }
        }

#       print "-" x 40, "\n";
        my $c = 1;  # 1-based count of successfully downloaded images
        my $p = 1;  # count of passes through the loop

        foreach my $tvu (@thumbimg_urls) {
            my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
            $img_url =~ s/%25/%/g;

            my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
##          print STDERR "****imgref_url = $imgref_url\n";
            $imgref_url =~ s/%25/%/g;

            my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
            $img_ext = lc($img_ext) if defined $img_ext;

            # remove http:// if there, so later we can explicitly add it in
            $img_url =~ s/^http:\/\///;

            print "Downloading image url http://$img_url\n";
            my $output_fname = "$search_term_dir/img_$c.$img_ext";

            get_gi_page("http://",$img_url,$output_fname);

            if (-s $output_fname == 0) {
                # empty download -- discard it
                unlink $output_fname;
            }
            else {
                # a single identify call: its output is captured in $result
                # and its exit status is left in $?
                my $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\" 2>&1";
                my $result = `$command`;

                my $status = $?;
                # the raw wait status in $? is 0 only on success, so it can
                # be compared against 0 directly -- no need to shift it by
                # 8 bits and convert it to a signed value first
                if ($status != 0) {
                    print STDERR "**** NOT JPEG: $output_fname\n";
                    unlink $output_fname;
                }
                else {

                    my $type = 'unknown';
                    my $width = 'unknown';
                    my $height = 'unknown';

                    # identify reports: <filename> <type> <width>x<height> ...
                    my $image_safe = quotemeta $output_fname;
                    if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
                        $type = $1;
                        $width = $2;
                        $height = $3;
                    }

                    my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";

                    # scale anything larger than 200x200 down to a thumbnail
                    if (($width ne "unknown") && ($height ne "unknown")) {
                        if (($width>200) || ($height>200)) {
                            `$imagick_cmd convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
                            `/bin/mv /tmp/x.jpg \"$output_fname\"`;
                        }
                    }
                    $c++;
                }
            }

            push(@imgref_urls,$imgref_url);

            last if ($c==3); # Only take first 2

            $p++;

            if ($p==20) {
                print STDERR "*** Unable to get enough images after 20 passes\n";
                last;
            }
        }

        if (defined $nextpage_url) {
            print "Next page URL:\n";
            print_tags($nextpage_url);
        }
#       print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}

# Join the search terms with '+' and URL-escape embedded quotes and spaces
sub make_search_term_safe
{
    my ($search_terms) = @_;

    my $search_term_safe = join("+",@$search_terms);
    $search_term_safe =~ s/\"/%22/g;
    $search_term_safe =~ s/ /+/g;

    return $search_term_safe;
}

# Build the Google Images query string for the given search terms
sub gi_query_url
{
    my ($search_term) = @_;

    my $search_term_safe = make_search_term_safe($search_term);

    my $nextpage_url
        = "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search";
    $nextpage_url .= "&q=$search_term_safe";

    return $nextpage_url;
}

sub gi_url_base
{
    return "http://images.google.com";
}

# Top-level entry point: search Google Images for the given terms and
# download the first couple of matching images into $output_dir.
# Returns a reference to the list of referring-page URLs.
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        mkdir($output_dir);
    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);
#   else {
#       print STDERR "  Images already mirrored\n";
#   }

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}


1;
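
For orientation, a minimal usage sketch of the module's entry point (the search terms and output directory here are hypothetical, and the file is assumed to be loaded with require):

    require "giget.pm";

    my @search_terms = ("kea", "parrot");
    my $imgref_urls  = giget(\@search_terms, "kea_images");

    foreach my $url (@$imgref_urls) {
        print "Referring page: $url\n";
    }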