source: gsdl/trunk/perllib/giget.pm@ 14374

Last change on this file since 14374 was 10112, checked in by davidb, 19 years ago

Minor tweak to pretty printing of "Searching Google images for"

  • Property svn:keywords set to Author Date Id Revision
File size: 5.1 KB
Line 
1
sub readin_html
{
    # Slurp an HTML file and return its entire contents as one string.
    # Returns undef for an empty file (callers test definedness), which
    # matches the original line-by-line accumulation behaviour.
    my ($html_fname) = @_;

    # Three-arg open with a lexical handle (2-arg open with a bareword
    # handle allowed mode characters in the filename to change behaviour).
    open(my $hin, "<", $html_fname)
	|| die "Unable to open $html_fname: $!\n";

    # Slurp mode: <$hin> returns the whole file, or undef at EOF (empty file).
    my $html_text = do { local $/; <$hin> };
    close($hin);

    return $html_text;
}
18
sub stripout_anchortags
{
    # Return every opening <a ...> tag found in $html_text, in order.
    #
    # BUGFIX: the original pattern was m/(<a\s+.*?>)+/gs.  Wrapping the
    # capture in a quantified group means a run of ADJACENT anchor tags
    # is consumed by one match that captures only the LAST tag of the
    # run, silently dropping the others.  Removing the '+' returns each
    # tag individually.
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)/gs);

    return @anchor_tags;
}
27
28
sub print_tags
{
    # Write each tag to STDOUT, one per line.
    my (@tags) = @_;

    for my $tag (@tags) {
	print "$tag\n";
    }
}
38
sub filter_tags
{
    # Keep only the tags matching $filter_text, which is interpolated as
    # an /x-mode regular expression (literal whitespace in the pattern is
    # ignored, so callers pass escaped patterns like "imgres\\?").
    my ($filter_text,@tags) = @_;

    return grep { m/$filter_text/x } @tags;
}
54
sub extract_urls {
    # Pull the href value out of each anchor tag.  href values are taken
    # to be unquoted and to end at the first space (the shape of Google's
    # result HTML at the time); tags without an href are skipped.
    my (@tags) = @_;

    my @urls;
    for my $tag (@tags) {
	next unless $tag =~ m/href=([^ ]+)/i;
	# undo the HTML entity-escaping of '&' so the URL is fetchable
	(my $url = $1) =~ s/&amp;/&/g;
	push(@urls, $url);
    }

    return @urls;
}
71
sub get_gi_page
{
    # Download "$cgi_base$cgi_call" into $downloadto_fname with wget,
    # unless a non-empty copy already exists.  Warns on STDERR when the
    # downloaded file ends up zero bytes.
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
	# SECURITY FIX: the original interpolated the URL and filename
	# into a backtick shell command; list-form system() passes the
	# arguments directly to wget, so shell metacharacters in a URL
	# scraped from a web page cannot inject commands.
	system("wget", "-nv", "-T", "10", "-nc", "-U", "Mozilla",
	       "-O", $downloadto_fname, $full_url);
    }

    if (-z $downloadto_fname) {
	print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}
89
90
sub parse_gi_search_page
{
    # Parse one downloaded Google Images result page: download up to two
    # thumbnail-linked images into $search_term_dir (as img_<n>.<ext>,
    # resized to at most 200x200), and work out the URL of the next
    # result page, if any.
    #
    # Args:
    #   $ga_base          - Google Images base URL (kept for interface
    #                       compatibility; not used in the body)
    #   $search_term_dir  - directory to save downloaded images into
    #   $downloaded_fname - local filename of the fetched result page
    #   $currpage_url     - the query URL that produced that page
    # Returns: ($nextpage_url, \@imgref_urls); $nextpage_url is undef
    #   when no later result page was found.
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;
    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
	my @anchor_tags = stripout_anchortags($downloaded_text);

	# Thumbnails link through /imgres?...; pager links carry start=<n>.
	my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
	my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

	my @thumbimg_urls = extract_urls(@thumbimg_tags);
	my @nextpage_urls = extract_urls(@nextpage_tags);

	# The "next" page is the first pager link whose start offset is
	# strictly greater than the current page's offset.
	my $curr_start = 0;
	if ($currpage_url =~ m/start=(\d+)/) {
	    $curr_start = $1;
	}

	foreach my $pot_url (@nextpage_urls) {
	    my ($next_start) = ($pot_url =~ m/start=(\d+)/);
	    # guard: extract_urls can hand back links without start=
	    if (defined($next_start) && $next_start>$curr_start) {
		$nextpage_url = $pot_url;
		last;
	    }
	}

	my $c = 1;   # number of the next image to save (stop after 2)
	my $p = 1;   # thumbnails attempted so far (give up after 20)

	foreach my $tvu (@thumbimg_urls) {
	    my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
	    my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
	    next unless defined $img_url;   # malformed thumbnail link

	    # Google double-escapes '%'; undo that before downloading.
	    $img_url =~ s/%25/%/g;
	    $imgref_url =~ s/%25/%/g if defined $imgref_url;

	    my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
	    $img_ext = lc($img_ext) if defined $img_ext;

	    # remove http:// if there, so later we can explicitly add it in
	    $img_url =~ s/^http:\/\///;

	    print "Downloading image url http://$img_url\n";
	    my $output_fname = "$search_term_dir/img_$c.$img_ext";

	    get_gi_page("http://",$img_url,$output_fname);

	    if (-s $output_fname == 0) {
		# empty (or failed) download: discard it
		unlink $output_fname;
	    }
	    elsif (system("identify", $output_fname) != 0) {
		# identify rejected the file, or could not be run at all
		# (list-form system: the extension came from an untrusted
		# URL and must never reach a shell; != 0 also catches the
		# -1 "exec failed" case the original > 0 test missed).
		# BUGFIX: original printed the literal text "output_fname".
		print STDERR "**** NOT JPEG: $output_fname\n";
		unlink $output_fname;
	    }
	    else {
		# Ask ImageMagick for type and dimensions; output looks
		# like: "<filename> JPEG 320x240 ...".
		my $result = '';
		if (open(my $identify_out, '-|', 'identify', $output_fname)) {
		    $result = join('', <$identify_out>);
		    close($identify_out);
		}

		my $type = 'unknown';
		my $width = 'unknown';
		my $height = 'unknown';

		my $image_safe = quotemeta $output_fname;
		if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
		    $type = $1;
		    $width = $2;
		    $height = $3;
		}

		# Shrink anything bigger than 200x200 to a thumbnail.
		if (($width ne "unknown") && ($height ne "unknown")) {
		    if (($width>200) || ($height>200)) {
			# list-form system: filename never touches a shell
			system("convert", $output_fname,
			       "-resize", "200x200", "/tmp/x.jpg");
			system("/bin/mv", "/tmp/x.jpg", $output_fname);
		    }
		}
		$c++;
	    }

	    push(@imgref_urls,$imgref_url) if defined $imgref_url;

	    last if ($c==3); # Only take first 2

	    $p++;
	    if ($p==20) {
		print STDERR "*** Unable to get enough images after 20 passes\n";
		last;
	    }
	}

	if (defined $nextpage_url) {
	    print "Next page URL:\n";
	    print_tags($nextpage_url);
	}
    }

    return ($nextpage_url, \@imgref_urls);
}
202
sub make_search_term_safe
{
    # Turn a list-ref of search terms into a single query-string value:
    # terms are joined with '+', double quotes become %22, and any
    # remaining spaces also become '+'.
    my ($search_terms) = @_;

    my $safe = join("+", @$search_terms);
    for ($safe) {
	s/\"/%22/g;
	s/ /+/g;
    }

    return $safe;
}
213
sub gi_query_url
{
    # Build the CGI path+query for a Google Images search on the given
    # list-ref of terms.  Fixed options: JPEG-only, colour images,
    # UTF-8 in/out, English interface.
    my ($search_term) = @_;

    my $safe_terms = make_search_term_safe($search_term);

    return join("&",
		"/images?as_filetype=jpg",
		"imgc=color",
		"ie=UTF-8",
		"oe=UTF-8",
		"hl=en",
		"btnG=Google+Search",
		"q=$safe_terms");
}
226
sub gi_url_base
{
    # Host against which all image-search CGI paths are resolved.
    return "http://images.google.com";
}
231
sub giget
{
    # Entry point: search Google Images for @$search_terms, download up
    # to two matching images into $output_dir, and return a ref to the
    # list of referring-page URLs for the thumbnails processed.
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
	# warn (not die): a failed mkdir surfaces again as download errors
	mkdir($output_dir)
	    || warn "Unable to create directory $output_dir: $!\n";
    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    # BUGFIX: $respage_fname was an undeclared package global, which
    # breaks as soon as the file runs under "use strict"; it is now
    # lexically scoped like every other variable here.
    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
	= parse_gi_search_page($gi_base,$output_dir,
			       $respage_fname,$nextpage_url);

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}
261
262
2631;
Note: See TracBrowser for help on using the repository browser.