use strict;
use warnings;

use File::Copy qw(move);
use File::Temp qw(tempfile);

# Number of images kept per search-results page.
use constant MAX_IMAGES    => 2;
# Maximum number of thumbnail links tried before giving up.
use constant MAX_PASSES    => 20;
# Images wider or taller than this (in pixels) are scaled down to fit.
use constant MAX_IMAGE_DIM => 200;


# Read the whole of $html_fname and return it as one string.
# Returns undef when the file is empty; dies when it cannot be opened.
sub readin_html
{
    my ($html_fname) = @_;

    open(my $html_fh, '<', $html_fname)
        || die "Unable to open $html_fname: $!\n";

    # Deliberately left undefined so that an empty file yields undef,
    # which parse_gi_search_page() treats as "nothing to parse".
    my $html_text;
    while (defined(my $line = <$html_fh>)) {
        $html_text .= $line;
    }
    close($html_fh);

    return $html_text;
}


# Return every opening anchor tag ("<a ...>") found in $html_text, in
# document order. Matching is case-insensitive and tags may span lines.
sub stripout_anchortags
{
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s[^>]*>)/gi);

    return @anchor_tags;
}


# Print each of @tags on a line of its own to STDOUT.
sub print_tags
{
    my (@tags) = @_;

    foreach my $tag (@tags) {
        print "$tag\n";
    }
}


# Return the members of @tags that match $filter_text. $filter_text is
# applied as a regular expression with the /x flag, so callers are expected
# to pass a pattern, not a literal string.
sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = grep { $_ =~ m/$filter_text/x } @tags;

    return @filtered_tags;
}


# Return the href value of each tag in @tags that has one. Quoted and
# unquoted attribute values are both accepted (the quotes and the closing
# ">" are not part of the result) and "&amp;" is decoded to "&".
sub extract_urls
{
    my (@tags) = @_;

    my @urls = ();

    foreach my $tag (@tags) {
        if ($tag =~ m/\bhref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i) {
            # Exactly one of the three alternatives captured something
            my ($url) = grep { defined } ($1, $2, $3);
            $url =~ s/&amp;/&/g;
            push(@urls,$url);
        }
    }

    return @urls;
}


# Download "$cgi_base$cgi_call" to $downloadto_fname with wget, unless a
# non-empty file of that name already exists. Prints a warning to STDERR if
# wget fails or the file ends up with zero bytes.
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        # wget only accepts -nc together with -O when the output file does
        # not exist, so clear away an empty leftover before retrying.
        unlink($downloadto_fname) if (-e $downloadto_fname);

        # List form: no shell is involved, so characters in the URL or the
        # file name cannot be interpreted as shell syntax.
        my @cmd = ('wget', '-nv', '-T', '10', '-nc', '-U', 'Mozilla',
                   '-O', $downloadto_fname, $full_url);
        ## print STDERR "*** wget cmd:\n @cmd\n";

        if (system(@cmd) != 0) {
            print STDERR "Warning: wget failed for $full_url\n";
        }
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}


# Return the first of @candidate_urls whose "start=N" value is greater than
# the one in $currpage_url (taken as 0 when absent), or undef if none is.
sub _pick_nextpage_url
{
    my ($currpage_url,@candidate_urls) = @_;

    my $curr_start = 0;
    if (defined $currpage_url && $currpage_url =~ m/start=(\d+)/) {
        $curr_start = $1;
    }

    foreach my $candidate (@candidate_urls) {
        my ($next_start) = ($candidate =~ m/start=(\d+)/);
        if (defined $next_start && $next_start > $curr_start) {
            return $candidate;
        }
    }

    return;
}


# Run ImageMagick's "identify" on $fname.
# Returns ($is_image, $width, $height). $is_image is false when identify
# exits with a non-zero status. When identify cannot be started at all the
# file is given the benefit of the doubt ($is_image true, sizes undef).
sub _identify_image
{
    my ($fname) = @_;

    my $identify_fh;
    if (!open($identify_fh, '-|', 'identify', $fname)) {
        return (1, undef, undef);
    }

    my $report = do { local $/; <$identify_fh> };
    $report = '' unless defined $report;
    close($identify_fh);    # waits for identify and sets $?
    my $exit_status = $?;

    # identify's report has always been echoed to STDOUT
    print $report;

    return (0, undef, undef) if ($exit_status != 0);

    my ($width, $height);
    if ($report =~ m/^\Q$fname\E \w+ (\d+)x(\d+)/) {
        ($width, $height) = ($1, $2);
    }

    return (1, $width, $height);
}


# Scale $fname down in place so it fits in MAX_IMAGE_DIM x MAX_IMAGE_DIM.
# The result is written as JPEG to a private temporary file first, so a
# failed conversion leaves the original image untouched.
sub _shrink_image
{
    my ($fname) = @_;

    my ($tmp_fh, $tmp_fname)
        = eval { tempfile('gigetXXXXXX', SUFFIX => '.jpg', TMPDIR => 1) };
    if (!defined $tmp_fname) {
        print STDERR "Warning: unable to create temporary file: $@";
        return;
    }
    close($tmp_fh);

    my $geometry = MAX_IMAGE_DIM . 'x' . MAX_IMAGE_DIM;
    if ((system('convert', $fname, '-resize', $geometry, $tmp_fname) == 0)
        && (-s $tmp_fname)) {
        if (!move($tmp_fname, $fname)) {
            print STDERR "Warning: unable to replace $fname: $!\n";
        }
    }
    else {
        print STDERR "Warning: unable to resize $fname\n";
    }

    unlink($tmp_fname) if (-e $tmp_fname);
}


# Download $img_url to $output_fname, check it with identify and shrink it
# if it is too large. Returns 1 if an image was kept, 0 otherwise (in which
# case $output_fname has been removed).
sub _fetch_image
{
    my ($img_url,$output_fname) = @_;

    # Split off the scheme so that it can be passed as the "base" part;
    # URLs without one are assumed to be plain http.
    my $scheme = "http://";
    if ($img_url =~ s/^(https?:\/\/)//i) {
        $scheme = $1;
    }

    print "Downloading image url $scheme$img_url\n";
    get_gi_page($scheme,$img_url,$output_fname);

    if (!-s $output_fname) {
        unlink($output_fname) if (-e $output_fname);
        return 0;
    }

    my ($is_image, $width, $height) = _identify_image($output_fname);
    if (!$is_image) {
        print STDERR "**** NOT JPEG: $output_fname\n";
        unlink($output_fname);
        return 0;
    }

    if (defined $width && defined $height
        && (($width > MAX_IMAGE_DIM) || ($height > MAX_IMAGE_DIM))) {
        _shrink_image($output_fname);
    }

    return 1;
}


# Parse the search-results page saved in $downloaded_fname.
#
# Up to MAX_IMAGES of the images it links to are downloaded into
# $search_term_dir as img_1.<ext>, img_2.<ext>, ... ; at most MAX_PASSES
# thumbnail links are tried. $currpage_url is the URL the page was fetched
# from and is used to recognise the link to the following page. $ga_base is
# accepted for interface compatibility and is not used.
#
# Returns ($nextpage_url, \@imgref_urls): the URL of the next results page
# (undef if none was found) and the "imgrefurl" value of every thumbnail
# link that was tried (an entry is undef if the link had no such value).
sub parse_gi_search_page
{
    my ($ga_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;
    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (!defined $downloaded_text) {
        return ($nextpage_url, \@imgref_urls);
    }

    my @anchor_tags = stripout_anchortags($downloaded_text);

    my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
    my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

    my @thumbimg_urls = extract_urls(@thumbimg_tags);
    my @nextpage_urls = extract_urls(@nextpage_tags);

    $nextpage_url = _pick_nextpage_url($currpage_url,@nextpage_urls);

    my $img_num = 1;    # number given to the next image that is kept
    my $passes = 0;     # thumbnail links tried so far

    foreach my $thumb_url (@thumbimg_urls) {
        my ($img_url) = ($thumb_url =~ m/imgurl=([^&]*)/);
        my ($imgref_url) = ($thumb_url =~ m/imgrefurl=([^&]*)/);
        ## print STDERR "****imgref_url = $imgref_url\n";

        # The values arrive double-escaped; undo one level of it
        $imgref_url =~ s/%25/%/g if (defined $imgref_url);

        if (defined $img_url && $img_url ne '') {
            $img_url =~ s/%25/%/g;

            my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
            $img_ext = defined $img_ext ? lc($img_ext) : '';

            my $output_fname = "$search_term_dir/img_$img_num.$img_ext";
            if (_fetch_image($img_url,$output_fname)) {
                $img_num++;
            }
        }

        push(@imgref_urls,$imgref_url);

        last if ($img_num > MAX_IMAGES);

        $passes++;
        if ($passes >= MAX_PASSES) {
            print STDERR "*** Unable to get enough images after 20 passes\n";
            last;
        }
    }

    if (defined $nextpage_url) {
        print "Next page URL:\n";
        print_tags($nextpage_url);
    }

    return ($nextpage_url, \@imgref_urls);
}


# Turn the array reference $search_terms into a query-string value: each
# term is percent-encoded (spaces become "+") and the terms are joined with
# "+". The caller's array is not modified.
sub make_search_term_safe
{
    my ($search_terms) = @_;

    my @encoded_terms = map {
        my $term = $_;
        # Work on the UTF-8 bytes, matching the ie=UTF-8 query parameter
        utf8::encode($term) if (utf8::is_utf8($term));
        $term =~ s/([^A-Za-z0-9\-_.~ ])/sprintf("%%%02X", ord($1))/ge;
        $term =~ tr/ /+/;
        $term;
    } @$search_terms;

    return join("+",@encoded_terms);
}


# Return the path-and-query part of the image-search URL for the terms in
# the array reference $search_term.
sub gi_query_url
{
    my ($search_term) = @_;

    my $search_term_safe = make_search_term_safe($search_term);

    my $nextpage_url
        = "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search";
    $nextpage_url .= "&q=$search_term_safe";

    return $nextpage_url;
}


# Return the scheme-and-host part of the image-search URL.
sub gi_url_base
{
    return "http://images.google.com";
}


# Search for the terms in the array reference $search_terms and download
# the first few resulting images into $output_dir (created if necessary;
# dies if that is not possible). The results page itself is saved there as
# respage1.html. Returns a reference to an array of the referring-page URLs
# of the thumbnails that were tried.
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        # The -d test covers another process creating it in the meantime
        mkdir($output_dir) || (-d $output_dir)
            || die "Unable to create directory $output_dir: $!\n";
    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}

1;