source: main/tags/2.52/gsdl/perllib/giget.pm@ 25422

Last change on this file since 25422 was 7492, checked in by davidb, 20 years ago

giget.pm is a module for accessing Google Images. It is used by MP3Plug
to find associated images based on Title and Artist metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1
# Read the whole of a file into a single string.
#
#   $html_fname - path of the file to read.
#
# Returns the file's contents, or undef when the file is empty
# (parse_gi_search_page() tests the result with 'defined').
# Dies if the file cannot be opened.
sub readin_html
{
    my ($html_fname) = @_;

    # Three-argument open with a lexical handle: the name is used verbatim
    # (no whitespace stripping, no special meaning for '-' or a leading '&')
    # and the handle cannot clash with a global HIN used elsewhere.
    open(my $hin, '<', $html_fname)
        || die "Unable to open $html_fname: $!\n";

    # Deliberately left undefined so that an empty file yields undef.
    my $html_text;
    while (defined(my $line = <$hin>)) {
        $html_text .= $line;
    }
    close($hin);

    return $html_text;
}
18
# Extract every opening anchor tag from a chunk of HTML.
#
#   $html_text - the HTML to scan (may span multiple lines).
#
# Returns the list of opening tags, each running from '<a' up to and
# including the first following '>'.  Closing '</a>' tags and the
# anchor text are not included.
sub stripout_anchortags
{
    my ($html_text) = @_;

    return () unless defined $html_text;

    # One capture per tag: a quantifier on the group would fold a run of
    # adjacent tags into a single match and return only the last of them.
    # /i accepts '<A ...>' too; /s lets a tag span line breaks.
    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)/gis);

    return @anchor_tags;
}
27
28
# Print each argument on a line of its own to the currently selected
# output handle.
sub print_tags
{
    my @lines = @_;

    print "$_\n" for @lines;
}
38
# Keep only the tags that match a regular expression.
#
#   $filter_text - pattern source; it is applied with the /x modifier, so
#                  unescaped whitespace inside it is ignored.
#   @tags        - candidate strings.
#
# Returns the matching strings in their original order.
sub filter_tags
{
    my ($filter_text, @tags) = @_;

    my @kept = grep { $_ =~ m/$filter_text/x } @tags;

    return @kept;
}
54
# Pull the href value out of each anchor tag.
#
#   @tags - opening anchor tags, e.g. as returned by stripout_anchortags().
#
# Returns one URL per tag that has an href attribute, in order.  The value
# may be double-quoted, single-quoted or bare; quotes and the tag's closing
# '>' are not part of the result, and '&amp;' is decoded to '&'.
# Tags without an href are skipped.
sub extract_urls
{
    my (@tags) = @_;

    my @urls = ();

    foreach my $tag (@tags) {
        next unless defined $tag;

        # Exactly one of the three captures is defined on a match.  A bare
        # value ends at whitespace or at the '>' that closes the tag.
        my @captures
            = ($tag =~ m/\bhref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i);
        next unless @captures;

        my ($url) = grep { defined $_ } @captures;
        next unless defined $url;

        $url =~ s/&amp;/&/g;
        push(@urls, $url);
    }

    return @urls;
}
71
# Download a URL to a local file with wget, unless it is already there.
#
#   $cgi_base         - first part of the URL (e.g. scheme and host).
#   $cgi_call         - remainder of the URL; appended to $cgi_base as is.
#   $downloadto_fname - file the response body is written to.
#
# Nothing is fetched when $downloadto_fname already exists and is
# non-empty.  Warnings go to STDERR when wget reports failure or the
# resulting file is empty.  No return value is defined.
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        # wget only accepts -nc together with -O when the output file does
        # not exist yet, so an empty leftover has to go before retrying.
        unlink($downloadto_fname) if (-e $downloadto_fname);

        # List form of system(): the URL (which may come from a scraped
        # page) and the filename are passed as single arguments and are
        # never interpreted by a shell.
        my @cmd = ('wget', '-nv', '-T', '10', '-nc', '-U', 'Mozilla',
                   '-O', $downloadto_fname, $full_url);
        if (system(@cmd) != 0) {
            print STDERR "Warning: wget failed for $full_url\n";
        }
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}
87
88
# Run ImageMagick's 'identify' on a file.
#
# Returns (1, $width, $height) when identify exits successfully; the two
# dimensions are undef if its output could not be parsed.  Returns the
# empty list when identify cannot be run or rejects the file.
sub _gi_identify_image
{
    my ($image_fname) = @_;

    # List-form pipe open: the filename reaches identify as one argument
    # without passing through a shell.
    my $pid = open(my $ident, '-|', 'identify', $image_fname);
    return () unless $pid;

    my $result = join('', <$ident>);

    # close() on a pipe waits for the child and is false on non-zero exit.
    return () unless close($ident);

    my ($width, $height)
        = ($result =~ m/^\Q$image_fname\E \w+ (\d+)x(\d+)/);

    return (1, $width, $height);
}

# Shrink an image in place so that it fits in a $max_dim x $max_dim box.
# On any failure the original file is left untouched.
sub _gi_shrink_image
{
    my ($image_fname, $max_dim) = @_;

    # Per-process scratch file next to the image (so rename() stays on one
    # filesystem).  The .jpg suffix makes convert write JPEG data.
    my $tmp_fname = $image_fname . '.' . $$ . '.resized.jpg';

    my $status = system('convert', $image_fname,
                        '-resize', $max_dim . 'x' . $max_dim, $tmp_fname);

    if (($status == 0) && (-s $tmp_fname)
        && rename($tmp_fname, $image_fname)) {
        return;
    }

    print STDERR "Warning: unable to resize $image_fname\n";
    unlink($tmp_fname);
}

# Parse a downloaded Google Images result page, fetch the first images it
# links to and work out the URL of the following result page.
#
#   $ga_base          - accepted for interface compatibility; not used.
#   $search_term_dir  - directory the images are saved into, as
#                       img_<n>.<ext> with <n> starting at 1.
#   $downloaded_fname - local copy of the result page.
#   $currpage_url     - URL the page was fetched from; its 'start='
#                       parameter (default 0) is the current offset.
#
# Returns ($nextpage_url, \@imgref_urls): the first pagination link whose
# 'start=' value exceeds the current one (undef if none), and a reference
# to the list of 'imgrefurl' values of the results that were tried.
# Dies (via readin_html) if $downloaded_fname cannot be opened.
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $max_images = 2;    # stop once this many images have been kept
    my $max_passes = 20;   # give up when the pass counter reaches this
    my $max_dim    = 200;  # larger images are scaled down to fit this box

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
        my @anchor_tags = stripout_anchortags($downloaded_text);

        my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
        my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

        my @thumbimg_urls = extract_urls(@thumbimg_tags);
        my @nextpage_urls = extract_urls(@nextpage_tags);

        my $curr_start = 0;
        if ((defined $currpage_url) && ($currpage_url =~ m/start=(\d+)/)) {
            $curr_start = $1;
        }

        foreach my $pot_url (@nextpage_urls) {
            # The tag filter matched the whole tag, so the href itself is
            # not guaranteed to carry a start= parameter.
            my ($next_start) = ($pot_url =~ m/start=(\d+)/);
            if ((defined $next_start) && ($next_start>$curr_start)) {
                $nextpage_url = $pot_url;
                last;
            }
        }

        my $c = 1;   # number the next kept image will get
        my $p = 1;   # pass counter

        foreach my $tvu (@thumbimg_urls) {
            my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
            my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);

            # The values are double-escaped inside the result URL.
            $img_url    =~ s/%25/%/g if (defined $img_url);
            $imgref_url =~ s/%25/%/g if (defined $imgref_url);

            if ((defined $img_url) && ($img_url ne "")) {
                my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
                $img_ext = (defined $img_ext) ? lc($img_ext) : "";

                print "Downloading image url http://$img_url\n";
                my $output_fname = "$search_term_dir/img_$c.$img_ext";

                get_gi_page("http://",$img_url,$output_fname);

                if (!-s $output_fname) {
                    # Missing or empty: nothing usable was downloaded.
                    unlink $output_fname;
                }
                else {
                    my ($identified, $width, $height)
                        = _gi_identify_image($output_fname);

                    if (!$identified) {
                        print STDERR "**** NOT JPEG: $output_fname \n";
                        unlink $output_fname;
                    }
                    else {
                        if ((defined $width) && (defined $height)
                            && (($width>$max_dim) || ($height>$max_dim))) {
                            _gi_shrink_image($output_fname, $max_dim);
                        }
                        $c++;
                    }
                }
            }

            # NOTE(review): the referring URL is recorded even when the
            # image itself was discarded, so this list is not index-aligned
            # with the img_<n> files -- confirm that callers expect this.
            push(@imgref_urls,$imgref_url) if (defined $imgref_url);

            last if ($c > $max_images);

            $p++;

            if ($p==$max_passes) {
                print STDERR "*** Unable to get enough images after $max_passes passes\n";
                last;
            }
        }

        if (defined $nextpage_url) {
            print "Next page URL:\n";
            print_tags($nextpage_url);
        }
    }

    return ($nextpage_url, \@imgref_urls);
}
197
# Turn a list of search terms into the value of a URL query parameter.
#
#   $search_terms - reference to an array of raw (unescaped) terms.
#
# Returns the terms joined with '+'.  Within each term a space becomes '+'
# and every byte other than an ASCII letter, digit, '-', '_', '.' or '~'
# is percent-encoded, so characters such as '"', '&', '#', '%' and '+'
# cannot change the structure of the query string.
sub make_search_term_safe
{
    my ($search_terms) = @_;

    my @encoded_terms = ();

    foreach my $term (@$search_terms) {
        my $bytes = (defined $term) ? $term : "";

        # Character strings are sent as UTF-8; strings that are already
        # bytes are encoded exactly as given.
        utf8::encode($bytes) if (utf8::is_utf8($bytes));

        $bytes =~ s/([^A-Za-z0-9\-_.~ ])/sprintf("%%%02X", ord($1))/ge;
        $bytes =~ s/ /+/g;

        push(@encoded_terms, $bytes);
    }

    return join("+", @encoded_terms);
}
208
# Build the path-and-query part of a Google Images search URL.
#
#   $search_term - passed straight to make_search_term_safe(), which
#                  dereferences it as an array of terms.
#
# Returns a string starting with '/images?' that carries the fixed search
# options followed by the escaped terms in the 'q' parameter.
sub gi_query_url
{
    my ($search_term) = @_;

    my $fixed_query
        = '/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search';

    return $fixed_query . '&q=' . make_search_term_safe($search_term);
}
221
# Return the scheme-and-host prefix used for Google Images requests.
sub gi_url_base
{
    my $base_url = "http://images.google.com";

    return $base_url;
}
226
# Search Google Images for the given terms and download the first results.
#
#   $search_terms - reference to an array of search terms.
#   $output_dir   - directory for the result page (respage1.html) and the
#                   images; created if it does not exist.
#
# Returns the array reference produced by parse_gi_search_page() for the
# first result page.  Progress messages are written to STDERR.
# Dies if $output_dir cannot be created.
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        # The -d re-check tolerates another process creating the directory
        # between the test above and the mkdir.
        mkdir($output_dir)
            or (-d $output_dir)
            or die "Unable to create directory $output_dir: $!\n";
    }

    print STDERR "Searching Google Images for: ", join(" ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    # Lexical, so the name does not leak into the package namespace.
    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}
256
257
2581;
Note: See TracBrowser for help on using the repository browser.