source: gs2-extensions/parallel-building/trunk/src/perllib/giget.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try to make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl committed in the main trunk but not in the x64 branch)

File size: 5.6 KB
use strict;
use util;


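# Slurp an HTML file into a single string (undef if the file was empty).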
sub readin_html
{
    my ($html_fname) = @_;

    open(HIN, '<', $html_fname)
	|| die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line=<HIN>)) {
	$html_text .= $line;
    }
    close(HIN);

    return $html_text;
}

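# Extract every opening <a ...> anchor tag from the HTML text.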
sub stripout_anchortags
{
    my ($html_text) = @_;

    # capture each tag individually (a trailing '+' on the group would
    # keep only the last tag in any run of adjacent anchors)
    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)/gs);

    return @anchor_tags;
}


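# Print each tag on its own line.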
sub print_tags
{
    my (@tags) = @_;

    foreach my $tag (@tags) {
	print "$tag\n";
    }
}

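# Keep only the tags that match the given regular-expression filter.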
sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = ();

    foreach my $t (@tags) {
	if ($t =~ m/$filter_text/x) {
	    push(@filtered_tags,$t);
	}
    }

    return @filtered_tags;
}

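# Pull the href target out of each tag, undoing &amp; entity encoding.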
sub extract_urls
{
    my (@tags) = @_;

    my @urls = ();

    foreach my $t (@tags) {
	if ($t =~ m/href=([^ ]+)/i) {
	    my $url = $1;
	    $url =~ s/&amp;/&/g;
	    push(@urls,$url);
	}
    }

    return @urls;
}

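# Download a URL to a local file with wget, skipping the download if a
# non-empty copy already exists.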
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    # only download if the file does not already exist or is empty
    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
	my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
##	print STDERR "*** wget cmd:\n $cmd\n";

	`$cmd`;
    }

    if (-z $downloadto_fname) {
	print STDERR "Warning: downloaded file is 0 bytes!\n";
    }
}


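# Parse one page of Google Image search results: locate the thumbnail and
# "next page" links, download the first two usable images into
# $search_term_dir, and return the next results-page URL (if any) along
# with the list of pages the images came from.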
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
	my @anchor_tags = stripout_anchortags($downloaded_text);

	my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
	my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

	my @thumbimg_urls = extract_urls(@thumbimg_tags);
	my @nextpage_urls = extract_urls(@nextpage_tags);

	my $curr_start = 0;
	if ($currpage_url =~ m/start=(\d+)/) {
	    $curr_start = $1;
	}

	# the "next page" link is the first one whose start= offset is
	# beyond the current page's offset
	foreach my $pot_url (@nextpage_urls) {
	    my ($next_start) = ($pot_url =~ m/start=(\d+)/);
	    if ($next_start > $curr_start) {
		$nextpage_url = $pot_url;
		last;
	    }
	}

#	print "-" x 40, "\n";
	my $c = 1;
	my $p = 1;

	foreach my $tvu (@thumbimg_urls) {
	    my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
	    $img_url =~ s/%25/%/g;

	    my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
##	    print STDERR "****imgref_url = $imgref_url\n";
	    $imgref_url =~ s/%25/%/g;

	    my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
	    $img_ext = lc($img_ext);

	    # remove http:// if there, so later we can explicitly add it in
	    $img_url =~ s/^http:\/\///;

	    print "Downloading image url http://$img_url\n";
	    my $output_fname = "$search_term_dir/img_$c.$img_ext";

	    get_gi_page("http://",$img_url,$output_fname);

	    # discard the download if it failed or is empty
	    if (!-s $output_fname) {
		unlink $output_fname;
	    }
	    else {
		my $command = "\"".&util::get_perl_exec()."\" -S gs-magick.pl identify \"$output_fname\" 2>&1";
		my $result = `$command`;

		my $status = $?;
		# need to shift the $? exit code returned by system() by 8 bits and
		# then convert it to a signed value to work out whether it is indeed > 0
		#$status >>= 8;
		#$status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
		#if($status > 0 ) {
		if ($status != 0) {
		    print STDERR "**** NOT JPEG: $output_fname\n";
		    unlink $output_fname;
		}
		else {
		    my $type = 'unknown';
		    my $width = 'unknown';
		    my $height = 'unknown';

		    my $image_safe = quotemeta $output_fname;
		    if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
			$type = $1;
			$width = $2;
			$height = $3;
		    }

		    my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";

		    # scale anything larger than 200x200 down to thumbnail size
		    if (($width ne "unknown") && ($height ne "unknown")) {
			if (($width > 200) || ($height > 200)) {
			    `$imagick_cmd convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
			    `/bin/mv /tmp/x.jpg \"$output_fname\"`;
			}
		    }
		    $c++;
		}
	    }

	    push(@imgref_urls,$imgref_url);

	    last if ($c==3); # Only take the first 2 images

	    $p++;

	    if ($p==20) {
		print STDERR "*** Unable to get enough images after 20 passes\n";
		last;
	    }
	}

	if (defined $nextpage_url) {
	    print "Next page URL:\n";
	    print_tags($nextpage_url);
	}
#	print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}

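# Join the search terms with '+' and escape quotes and spaces so the
# result is safe to embed in a URL query string.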
sub make_search_term_safe
{
    my ($search_terms) = @_;

    my $search_term_safe = join("+",@$search_terms);
    $search_term_safe =~ s/\"/%22/g;
    $search_term_safe =~ s/ /+/g;

    return $search_term_safe;
}

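# Build the CGI query string for a Google Image search (colour JPEGs only).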
sub gi_query_url
{
    my ($search_terms) = @_;

    my $search_term_safe = make_search_term_safe($search_terms);

    my $nextpage_url
	= "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search";
    $nextpage_url .= "&q=$search_term_safe";

    return $nextpage_url;
}

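# Base URL that gi_query_url()'s query string is appended to.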
sub gi_url_base
{
    return "http://images.google.com";
}

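# Top-level entry point: mirror the first few Google Image results for the
# given search terms into $output_dir.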
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
	mkdir($output_dir)
	    || die "Unable to create directory $output_dir: $!\n";
    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
	= parse_gi_search_page($gi_base,$output_dir,
			       $respage_fname,$nextpage_url);

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}


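# A minimal usage sketch (an assumption based on the signatures above, not
# part of the original module): pass an array reference of search terms
# plus an output directory; giget() returns a reference to the list of
# pages the downloaded images were referenced from.
#
#     require "giget.pm";   # assumes giget.pm can be found via @INC
#
#     my $imgref_urls = &giget(["kiwi","bird"],"./kiwi-images");
#     foreach my $url (@$imgref_urls) {
#         print "image referenced from: $url\n";
#     }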
1;