source: gsdl/trunk/perllib/giget.pm@18342

Last change on this file since 18342 was 15889, checked in by mdewsnip, 16 years ago

Added "use strict", and fixed resulting problems.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
use strict;


sub readin_html
{
    my ($html_fname) = @_;

    open(HIN,"<$html_fname")
        || die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line=<HIN>)) {
        $html_text .= $line;
    }
    close(HIN);

    return $html_text;
}

sub stripout_anchortags
{
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)+/gs);

    return @anchor_tags;
}
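
# For example (illustrative input only), calling stripout_anchortags on the
# string '<p><a href="http://example.org/">link</a></p>' returns the single
# tag '<a href="http://example.org/">'; the regex keeps every <a ...> opening
# tag found in the page.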


sub print_tags
{
    my (@tags) = @_;

    my $a;
    foreach $a ( @tags) {
        print "$a\n";
    }
}

sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = ();

    my $t;
    foreach $t (@tags) {
        if ($t =~ m/$filter_text/x) {
            push(@filtered_tags,$t);
        }
    }

    return @filtered_tags;
}

sub extract_urls {
    my (@tags) = @_;

    my @urls = ();

    my $t;
    foreach $t (@tags) {
        if ($t =~ m/href=([^ ]+)/i) {
            my $url = $1;
            $url =~ s/&amp;/&/g;
            push(@urls,$url);
        }
    }

    return @urls;
}
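
# These helpers are chained below in parse_gi_search_page: pull every anchor
# tag out of a downloaded page, keep only the tags of interest, then recover
# their href values.  A rough sketch (variable names are illustrative):
#
#   my @anchors  = stripout_anchortags($html_text);
#   my @img_tags = filter_tags("imgres\\?", @anchors);
#   my @img_urls = extract_urls(@img_tags);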

sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
##      print STDERR "*** wget cmd:\n $cmd\n";

        `$cmd`;
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}
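
# get_gi_page shells out to wget, so wget must be on the PATH; the download is
# skipped when the target file already exists and is non-empty.  A hypothetical
# call fetching a results page into a local file:
#
#   get_gi_page("http://images.google.com", "/images?q=kiwi", "/tmp/respage1.html");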


sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
        my @anchor_tags = stripout_anchortags($downloaded_text);

        my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
        my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

        my @thumbimg_urls = extract_urls(@thumbimg_tags);
        my @nextpage_urls = extract_urls(@nextpage_tags);

        my $curr_start = 0;
        if ($currpage_url =~ m/start=(\d+)/) {
            $curr_start = $1;
        }

        my $pot_url;
        foreach $pot_url (@nextpage_urls) {

            my ($next_start) = ($pot_url =~ m/start=(\d+)/);
            if ($next_start>$curr_start) {
                $nextpage_url = $pot_url;
                last;
            }
        }

#       print "-" x 40, "\n";
        my $c = 1;
        my $p = 1;

        foreach my $tvu (@thumbimg_urls) {
            my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
            $img_url =~ s/%25/%/g;

            my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
##          print STDERR "****imgref_url = $imgref_url\n";
            $imgref_url =~ s/%25/%/g;

            my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
            $img_ext = lc($img_ext);

            # remove http:// if there, so later we can explicitly add it in
            $img_url =~ s/^http:\/\///;

            print "Downloading image url http://$img_url\n";
            my $output_fname = "$search_term_dir/img_$c.$img_ext";

            get_gi_page("http://",$img_url,$output_fname);

            if (-s $output_fname == 0) {
                unlink $output_fname;
            }
            elsif (system("identify \"$output_fname\"") > 0 ) {
                print STDERR "**** NOT JPEG: $output_fname\n";
                unlink $output_fname;
            }
            else {

                my $command = "identify \"$output_fname\" 2>&1";
                my $result = `$command`;

                my $type = 'unknown';
                my $width = 'unknown';
                my $height = 'unknown';

                my $image_safe = quotemeta $output_fname;
                if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
                    $type = $1;
                    $width = $2;
                    $height = $3;
                }

                if (($width ne "unknown") && ($height ne "unknown")) {
                    if (($width>200) || ($height>200)) {
                        `convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
                        `/bin/mv /tmp/x.jpg \"$output_fname\"`;
                    }
                }
                $c++;
            }

            push(@imgref_urls,$imgref_url);

            last if ($c==3); # Only take first 2

            $p++;

            if ($p==20) {
                print STDERR "*** Unable to get enough images after 20 passes\n";
                last;
            }


        }

        if (defined $nextpage_url) {
            print "Next page URL:\n";
            print_tags($nextpage_url);
        }
#       print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}
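
# parse_gi_search_page returns the URL of the next results page (undef if no
# link with a larger start= offset was found) plus a reference to the list of
# imgrefurl referring pages for the thumbnails it processed.  A sketch of how
# a caller could walk successive result pages (illustrative only; giget below
# just fetches the first page, and the file naming here is an assumption):
#
#   my $page = 2;
#   while (defined $nextpage_url) {
#       my $respage_fname = "$output_dir/respage$page.html";
#       get_gi_page($gi_base, $nextpage_url, $respage_fname);
#       ($nextpage_url, $imgref_urls)
#           = parse_gi_search_page($gi_base, $output_dir,
#                                  $respage_fname, $nextpage_url);
#       $page++;
#   }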

sub make_search_term_safe
{
    my ($search_terms) = @_;

    my $search_term_safe = join("+",@$search_terms);
    $search_term_safe =~ s/\"/%22/g;
    $search_term_safe =~ s/ /+/g;

    return $search_term_safe;
}
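
# For example, make_search_term_safe(["new","zealand","\"kiwi bird\""]) joins
# the terms with '+' and escapes quotes and spaces, giving
# 'new+zealand+%22kiwi+bird%22'.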

sub gi_query_url
{
    my ($search_term) = @_;

    my $search_term_safe = make_search_term_safe($search_term);

    my $nextpage_url
        = "/images?as_filetype=jpg&imgc=color\&ie=UTF-8\&oe=UTF-8\&hl=en\&btnG=Google+Search";
    $nextpage_url .= "\&q=$search_term_safe";

    return $nextpage_url;
}
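
# With search terms ["kiwi","bird"] this returns the path
# "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search&q=kiwi+bird",
# which get_gi_page appends to the base URL returned by gi_url_base().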

sub gi_url_base
{
    return "http://images.google.com";
}

sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        mkdir($output_dir);

    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);
#    else {
#        print STDERR "    Images already mirrored\n";
#    }

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}


1;
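
The file has no package declaration and ends with a true value, so a script can simply require it and call giget() directly. A minimal driver sketch, assuming giget.pm can be found on @INC and that wget and ImageMagick's identify/convert are installed; the search terms and output directory are illustrative only:

    #!/usr/bin/perl
    use strict;

    require giget;

    # Download the first couple of matching images into ./kiwi_images
    # and collect the pages they were referenced from.
    my $imgref_urls = giget(["kiwi","bird"], "kiwi_images");

    foreach my $url (@$imgref_urls) {
        print "referring page: $url\n";
    }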