source: main/trunk/greenstone2/perllib/giget.pm@ 31863

Last change on this file since 31863 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:keywords set to Author Date Id Revision
File size: 5.9 KB
Line 
1use strict;
2use util;
3
4
sub readin_html
{
    # Slurp an HTML file and return its entire contents as a single string.
    # Returns undef (not '') for an empty file — callers (parse_gi_search_page)
    # use a "defined" check on the result, so that contract is preserved.
    my ($html_fname) = @_;

    # Three-arg open with a lexical filehandle: avoids the mode-injection
    # risk of two-arg open and the process-global bareword handle HIN.
    open(my $hin, '<', $html_fname)
	|| die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line = <$hin>)) {
	$html_text .= $line;
    }
    close($hin);

    return $html_text;
}
21
sub stripout_anchortags
{
    # Return every opening <a ...> tag found in the given HTML, in
    # document order.  Evaluated in list context so /g yields all matches.
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)+/gs);

    return @anchor_tags;
}
30
31
sub print_tags
{
    # Print each tag on its own line to STDOUT.
    my (@tags) = @_;

    print "$_\n" foreach @tags;
}
41
sub filter_tags
{
    # Keep only the tags matching $filter_text.  The pattern is applied
    # with /x, so literal whitespace in the filter is insignificant —
    # same semantics as the original element-by-element loop.
    my ($filter_text, @tags) = @_;

    return grep { m/$filter_text/x } @tags;
}
57
sub extract_urls {
    # Pull the href value out of each anchor tag, un-escaping &amp;.
    # The value runs up to the first space, exactly as the original
    # href=([^ ]+) pattern did; tags without an href are skipped.
    my (@tags) = @_;

    my @urls;
    for my $tag (@tags) {
	next unless ($tag =~ m/href=([^ ]+)/i);
	my $url = $1;
	$url =~ s/&amp;/&/g;
	push(@urls, $url);
    }

    return @urls;
}
74
# Download $cgi_base$cgi_call to $downloadto_fname using wget, unless a
# non-empty copy of the file already exists.  Warns (but does not die)
# when the download produced a zero-byte file.
# NOTE(review): $full_url and $downloadto_fname are interpolated into a
# shell command — callers must not pass untrusted strings here.
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    # Only fetch if the target file is missing or empty (-z).
    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {

	# the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
	&util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

	# -nv quiet, -T 10 timeout, -nc no-clobber, -U spoof user agent.
	my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
## print STDERR "*** wget cmd:\n $cmd\n";

	`$cmd`;
    }

    if (-z $downloadto_fname) {
	print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}
96
97
# Scrape one Google Images result page that was saved to $downloaded_fname.
# Extracts thumbnail/image URLs, downloads up to 2 images (resized to at
# most 200x200) into $search_term_dir, and works out the URL of the next
# result page (the "images?...start=N" link with N greater than the
# current page's start).
#
# Returns ($nextpage_url, \@imgref_urls):
#   $nextpage_url  - URL of the next result page, or undef if none found
#   @imgref_urls   - imgrefurl values for every thumbnail processed
#                    (pushed even when the image download was discarded)
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    # readin_html returns undef if the file could not be read as non-empty text
    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
	my @anchor_tags = stripout_anchortags($downloaded_text);

	# Thumbnails link through /imgres?...; pagination links through
	# /images?...start=N.
	my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
	my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

	my @thumbimg_urls = extract_urls(@thumbimg_tags);
	my @nextpage_urls = extract_urls(@nextpage_tags);

	# start= offset of the page we are currently on (0 for page 1)
	my $curr_start = 0;
	if ($currpage_url =~ m/start=(\d+)/) {
	    $curr_start = $1;
	}

	# First candidate whose start= exceeds ours is the "next" page;
	# earlier/equal offsets are prev-page or self links.
	my $pot_url;
	foreach $pot_url (@nextpage_urls) {

	    my ($next_start) = ($pot_url =~ m/start=(\d+)/);
	    if ($next_start>$curr_start) {
		$nextpage_url = $pot_url;
		last;
	    }
	}

# print "-" x 40, "\n";
	# $c counts successfully-kept images (1-based); $p counts attempts.
	my $c = 1;
	my $p = 1;

	foreach my $tvu (@thumbimg_urls) {
	    # imgurl= is the full-size image; %25 un-escapes to a literal %
	    my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
	    $img_url =~ s/%25/%/g;

	    # imgrefurl= is the page the image came from
	    my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
## print STDERR "****imgref_url = $imgref_url\n";
	    $imgref_url =~ s/%25/%/g;

	    my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
	    $img_ext = lc($img_ext);

	    # remove http:// if there, so later we can explicitly add it in
	    $img_url =~ s/^http:\/\///;

	    print "Downloading image url http://$img_url\n";
	    my $output_fname = "$search_term_dir/img_$c.$img_ext";

	    get_gi_page("http://",$img_url,$output_fname);

	    # Discard empty downloads (-s gives file size in bytes)
	    if (-s $output_fname == 0) {
		unlink $output_fname;
	    }
	    else {
		# Use ImageMagick (via gs-magick.pl) to verify the file is
		# really an image and read its type and dimensions.
		my $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\" 2>&1";
		my $result = `$command`;

		my $status = $?;
		# need to shift the $? exit code returned by system() by 8 bits and
		# then convert it to a signed value to work out whether it is indeed > 0
		#$status >>= 8;
		#$status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
		#if($status > 0 ) {
		if($status != 0 ) {
		    # identify failed => not a usable image; drop it
		    print STDERR "**** NOT JPEG: output_fname \n";
		    unlink $output_fname;
		}
		else {

		    my $type = 'unknown';
		    my $width = 'unknown';
		    my $height = 'unknown';

		    # identify output format: "<file> <TYPE> <W>x<H> ..."
		    my $image_safe = quotemeta $output_fname;
		    if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
			$type = $1;
			$width = $2;
			$height = $3;
		    }

		    my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";

		    # Shrink anything bigger than 200px in either dimension.
		    # NOTE(review): uses a fixed /tmp/x.jpg scratch file —
		    # not safe for concurrent runs.
		    if (($width ne "unknown") && ($height ne "unknown")) {
			if (($width>200) || ($height>200)) {
			    `$imagick_cmd convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
			    `/bin/mv /tmp/x.jpg \"$output_fname\"`;
			}
		    }
		    $c++;
		}
	    }

	    # Referring page is recorded regardless of download success
	    push(@imgref_urls,$imgref_url);

	    last if ($c==3); # Only take first 2

	    $p++;

	    if ($p==20) {
		print STDERR "*** Unable to get enough images after 20 passes\n";
		last;
	    }


	}

	if (defined $nextpage_url) {
	    print "Next page URL:\n";
	    print_tags($nextpage_url);
	}
# print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}
219
sub make_search_term_safe
{
    # Join the given search terms (array ref) with '+' and make the
    # result safe for a query string: quotes become %22 and any
    # remaining spaces become '+'.
    my ($search_terms) = @_;

    my $safe = join("+", @$search_terms);
    $safe =~ s/\"/%22/g;
    $safe =~ s/ /+/g;

    return $safe;
}
230
sub gi_query_url
{
    # Build the path+query part of a Google Images search URL for the
    # given terms (array ref): JPEG-only, colour images, UTF-8, English.
    my ($search_term) = @_;

    my $safe_terms = make_search_term_safe($search_term);

    return "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search"
	 . "&q=$safe_terms";
}
243
sub gi_url_base
{
    # Host that all image-search requests are issued against.
    return 'http://images.google.com';
}
248
sub giget
{
    # Run a Google Images search for @$search_terms, saving the first
    # result page and downloaded images under $output_dir.  Returns a
    # ref to the list of referring-page URLs for the images processed.
    my ($search_terms, $output_dir) = @_;

    mkdir($output_dir) if (!-e $output_dir);

    print STDERR "Searching Google Images for: ", join(", ", @$search_terms), "\n";

    my $gi_base      = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    # Fetch the first page of results and scrape it for images.
    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base, $nextpage_url, $respage_fname);

    my $imgref_urls;
    ($nextpage_url, $imgref_urls) =
	parse_gi_search_page($gi_base, $output_dir,
			     $respage_fname, $nextpage_url);

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}
278
279
2801;
Note: See TracBrowser for help on using the repository browser.