source: trunk/gsdl/perllib/giget.pm@ 8889

Last change on this file since 8889 was 8889, checked in by davidb, 19 years ago

Small modification of image URL manipulation to remain consistent with
how Google Images generates its HTML.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.1 KB
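
# giget.pm -- fetch the first few images returned by a Google Images
# search by screen-scraping its result pages with wget.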
# Slurp the contents of an HTML file into a single string.
sub readin_html
{
    my ($html_fname) = @_;

    open(HIN, "<$html_fname")
        || die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line = <HIN>)) {
        $html_text .= $line;
    }
    close(HIN);

    return $html_text;
}

# Extract all opening <a ...> tags from the HTML text.  Each tag is
# captured individually (a '+' on the group would collapse a run of
# adjacent tags into a single match, keeping only the last one).
sub stripout_anchortags
{
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)/gs);

    return @anchor_tags;
}

sub print_tags
{
    my (@tags) = @_;

    my $a;
    foreach $a (@tags) {
        print "$a\n";
    }
}

# Return only those tags matching the given regular expression.
sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = ();

    my $t;
    foreach $t (@tags) {
        if ($t =~ m/$filter_text/x) {
            push(@filtered_tags,$t);
        }
    }

    return @filtered_tags;
}

# Pull the href target out of each tag, undoing HTML entity-encoded '&'.
sub extract_urls {
    my (@tags) = @_;

    my @urls = ();

    my $t;
    foreach $t (@tags) {
        if ($t =~ m/href=([^ ]+)/i) {
            my $url = $1;
            $url =~ s/&amp;/&/g;
            push(@urls,$url);
        }
    }

    return @urls;
}

# Download $cgi_base$cgi_call to $downloadto_fname with wget, unless a
# non-empty copy of the file already exists.
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
        `$cmd`;
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}
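
# For example (hypothetical query and filename), a call such as
#   get_gi_page("http://images.google.com", "/images?q=kea", "respage1.html");
# runs a command along the lines of
#   wget -nv -T 10 -nc -U "Mozilla" -O "respage1.html" "http://images.google.com/images?q=kea"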

# Parse one Google Images results page: extract the thumbnail links,
# download each underlying image, and locate the "next page" URL.
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
        my @anchor_tags = stripout_anchortags($downloaded_text);

        my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
        my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

        my @thumbimg_urls = extract_urls(@thumbimg_tags);
        my @nextpage_urls = extract_urls(@nextpage_tags);

        my $curr_start = 0;
        if ($currpage_url =~ m/start=(\d+)/) {
            $curr_start = $1;
        }

        # Pick the first "next page" link that advances beyond this page.
        my $pot_url;
        foreach $pot_url (@nextpage_urls) {

            my ($next_start) = ($pot_url =~ m/start=(\d+)/);
            if ($next_start>$curr_start) {
                $nextpage_url = $pot_url;
                last;
            }
        }

#       print "-" x 40, "\n";
        my $c = 1;
        my $p = 1;

        foreach my $tvu (@thumbimg_urls) {
            my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
            $img_url =~ s/%25/%/g;

            my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
##          print STDERR "****imgref_url = $imgref_url\n";
            $imgref_url =~ s/%25/%/g;

            my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
            $img_ext = lc($img_ext);

            # remove http:// if there, so later we can explicitly add it in
            $img_url =~ s/^http:\/\///;

            print "Downloading image url http://$img_url\n";
            my $output_fname = "$search_term_dir/img_$c.$img_ext";

            get_gi_page("http://",$img_url,$output_fname);

            if (-s $output_fname == 0) {
                unlink $output_fname;
            }
            elsif (system("identify \"$output_fname\"") > 0) {
                print STDERR "**** NOT JPEG: $output_fname\n";
                unlink $output_fname;
            }
            else {

                my $command = "identify \"$output_fname\" 2>&1";
                my $result = `$command`;

                my $type = 'unknown';
                my $width = 'unknown';
                my $height = 'unknown';

                my $image_safe = quotemeta $output_fname;
                if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
                    $type = $1;
                    $width = $2;
                    $height = $3;
                }

                # Scale anything bigger than 200x200 down to thumbnail size.
                if (($width ne "unknown") && ($height ne "unknown")) {
                    if (($width>200) || ($height>200)) {
                        `convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
                        `/bin/mv /tmp/x.jpg \"$output_fname\"`;
                    }
                }
                $c++;
            }

            push(@imgref_urls,$imgref_url);

            last if ($c==3); # Only take the first 2 images

            $p++;

            if ($p==20) {
                print STDERR "*** Unable to get enough images after 20 passes\n";
                last;
            }
        }

        if (defined $nextpage_url) {
            print "Next page URL:\n";
            print_tags($nextpage_url);
        }
#       print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}

# Join the search terms with '+' and escape embedded quotes for the CGI call.
sub make_search_term_safe
{
    my ($search_terms) = @_;

    my $search_term_safe = join("+",@$search_terms);
    $search_term_safe =~ s/\"/%22/g;
    $search_term_safe =~ s/ /+/g;

    return $search_term_safe;
}
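
# For example, make_search_term_safe(["kea","parrot"]) gives "kea+parrot",
# and make_search_term_safe(["\"southern alps\""]) gives "%22southern+alps%22".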

# Build the query portion of the Google Images search URL.
sub gi_query_url
{
    my ($search_term) = @_;

    my $search_term_safe = make_search_term_safe($search_term);

    my $nextpage_url
        = "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search";
    $nextpage_url .= "&q=$search_term_safe";

    return $nextpage_url;
}
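
# For example, gi_query_url(["kea"]) returns
#   /images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search&q=kea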

sub gi_url_base
{
    return "http://images.google.com";
}

# Top-level entry point: run a Google Images search for the given terms
# and mirror the first few hits into $output_dir.  Returns a reference
# to the list of pages the images came from.
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        mkdir($output_dir);
    }

    print STDERR "Searching Google Images for: ", join(" ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);
#   else {
#       print STDERR "  Images already mirrored\n";
#   }

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}
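
# A minimal usage sketch (assuming the module has been loaded, e.g. via
# require, and the output directory is writable):
#   require "giget.pm";
#   my $imgref_urls = giget(["kea","parrot"], "kea_images");
#   print "Source page: $_\n" foreach @$imgref_urls;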

1;