root/gsdl/trunk/perllib/giget.pm @ 15889

Revision 15889, 5.2 KB (checked in by mdewsnip, 11 years ago)

Added "use strict", and fixed resulting problems.

  • Property svn:keywords set to Author Date Id Revision
use strict;

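# readin_html: slurp an HTML file from disk and return its contents as a
# single string (undef if the file was empty).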
sub readin_html
{
    my ($html_fname) = @_;

    open(HIN, "<$html_fname")
        || die "Unable to open $html_fname: $!\n";

    my $html_text;
    my $line;
    while (defined ($line = <HIN>)) {
        $html_text .= $line;
    }
    close(HIN);

    return $html_text;
}

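# stripout_anchortags: return a list of all the <a ...> opening tags found
# in the given HTML text.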
sub stripout_anchortags
{
    my ($html_text) = @_;

    my @anchor_tags = ($html_text =~ m/(<a\s+.*?>)/gs);

    return @anchor_tags;
}

sub print_tags
{
    my (@tags) = @_;

    foreach my $tag (@tags) {
        print "$tag\n";
    }
}

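# filter_tags: return only those tags that match the given regular
# expression.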
sub filter_tags
{
    my ($filter_text,@tags) = @_;

    my @filtered_tags = ();

    foreach my $t (@tags) {
        if ($t =~ m/$filter_text/x) {
            push(@filtered_tags,$t);
        }
    }

    return @filtered_tags;
}

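# extract_urls: pull the href value out of each tag, decoding &amp;
# entities back to plain ampersands.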
sub extract_urls {
    my (@tags) = @_;

    my @urls = ();

    foreach my $t (@tags) {
        if ($t =~ m/href=([^ ]+)/i) {
            my $url = $1;
            $url =~ s/&amp;/&/g;
            push(@urls,$url);
        }
    }

    return @urls;
}

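# get_gi_page: download $cgi_base$cgi_call to $downloadto_fname with wget,
# unless a non-empty copy already exists (-nv: no-verbose, -T 10: 10 second
# timeout, -nc: no-clobber, -U: user-agent string, -O: output file).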
sub get_gi_page
{
    my ($cgi_base,$cgi_call,$downloadto_fname) = @_;

    my $full_url = "$cgi_base$cgi_call";

    if ((!-e $downloadto_fname) || (-z $downloadto_fname)) {
        my $cmd = "wget -nv -T 10 -nc -U \"Mozilla\" -O \"$downloadto_fname\" \"$full_url\"";
##      print STDERR "*** wget cmd:\n $cmd\n";

        `$cmd`;
    }

    if (-z $downloadto_fname) {
        print STDERR "Warning: downloaded file 0 bytes!\n";
    }
}

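# parse_gi_search_page: scan a downloaded Google Images results page for
# thumbnail ("imgres?") and next-page ("images?...start=N") links, download
# up to two usable images into $search_term_dir (shrinking anything larger
# than 200x200 with ImageMagick), and return the next-page URL plus a
# reference to the list of referring-page URLs.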
sub parse_gi_search_page
{
    my ($gi_base,$search_term_dir,$downloaded_fname,$currpage_url) = @_;

    my $nextpage_url = undef;

    my @imgref_urls = ();

    my $downloaded_text = readin_html($downloaded_fname);
    if (defined $downloaded_text) {
        my @anchor_tags = stripout_anchortags($downloaded_text);

        my @thumbimg_tags = filter_tags("imgres\\?",@anchor_tags);
        my @nextpage_tags = filter_tags("images\\?.*?start=\\d+",@anchor_tags);

        my @thumbimg_urls = extract_urls(@thumbimg_tags);
        my @nextpage_urls = extract_urls(@nextpage_tags);

        # Work out where the current page starts, so the "next page" link
        # is the first candidate that starts beyond it
        my $curr_start = 0;
        if ($currpage_url =~ m/start=(\d+)/) {
            $curr_start = $1;
        }

        foreach my $pot_url (@nextpage_urls) {
            my ($next_start) = ($pot_url =~ m/start=(\d+)/);
            next unless defined $next_start;

            if ($next_start>$curr_start) {
                $nextpage_url = $pot_url;
                last;
            }
        }

#       print "-" x 40, "\n";
        my $c = 1;
        my $p = 1;

        foreach my $tvu (@thumbimg_urls) {
            my ($img_url) = ($tvu =~ m/imgurl=([^&]*)/);
            $img_url =~ s/%25/%/g;

            my ($imgref_url) = ($tvu =~ m/imgrefurl=([^&]*)/);
##          print STDERR "****imgref_url = $imgref_url\n";
            $imgref_url =~ s/%25/%/g;

            my ($img_ext) = ($img_url =~ m/\.(\w+)$/);
            $img_ext = lc($img_ext);

            # remove http:// if there, so later we can explicitly add it in
            $img_url =~ s/^http:\/\///;

            print "Downloading image url http://$img_url\n";
            my $output_fname = "$search_term_dir/img_$c.$img_ext";

            get_gi_page("http://",$img_url,$output_fname);

            if (-s $output_fname == 0) {
                unlink $output_fname;
            }
            elsif (system("identify \"$output_fname\"") > 0) {
                print STDERR "**** NOT JPEG: $output_fname\n";
                unlink $output_fname;
            }
            else {
                my $command = "identify \"$output_fname\" 2>&1";
                my $result = `$command`;

                my $type =   'unknown';
                my $width =  'unknown';
                my $height = 'unknown';

                my $image_safe = quotemeta $output_fname;
                if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
                    $type = $1;
                    $width = $2;
                    $height = $3;
                }

                # Shrink anything bigger than 200x200 down to thumbnail size
                if (($width ne "unknown") && ($height ne "unknown")) {
                    if (($width>200) || ($height>200)) {
                        `convert \"$output_fname\" -resize 200x200 /tmp/x.jpg`;
                        `/bin/mv /tmp/x.jpg \"$output_fname\"`;
                    }
                }
                $c++;
            }

            push(@imgref_urls,$imgref_url);

            last if ($c==3); # Only take first 2

            $p++;

            if ($p==20) {
                print STDERR "*** Unable to get enough images after 20 passes\n";
                last;
            }
        }

        if (defined $nextpage_url) {
            print "Next page URL:\n";
            print_tags($nextpage_url);
        }
#       print "-" x 40, "\n";
    }

    return ($nextpage_url, \@imgref_urls);
}

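# make_search_term_safe: join the search terms into a single URL-safe
# query string.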
sub make_search_term_safe
{
    my ($search_terms) = @_;

    my $search_term_safe = join("+",@$search_terms);
    $search_term_safe =~ s/\"/%22/g;
    $search_term_safe =~ s/ /+/g;

    return $search_term_safe;
}

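# gi_query_url: build the CGI query path for a Google Images search for
# colour JPEG images.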
sub gi_query_url
{
    my ($search_terms) = @_;

    my $search_term_safe = make_search_term_safe($search_terms);

    my $nextpage_url
        = "/images?as_filetype=jpg&imgc=color&ie=UTF-8&oe=UTF-8&hl=en&btnG=Google+Search";
    $nextpage_url .= "&q=$search_term_safe";

    return $nextpage_url;
}

sub gi_url_base
{
    return "http://images.google.com";
}

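# giget: top-level entry point.  Search Google Images for the given terms,
# mirror the first couple of matching images into $output_dir, and return
# a reference to the list of pages the images came from.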
sub giget
{
    my ($search_terms,$output_dir) = @_;
    my $imgref_urls = [];

    if (!-e $output_dir) {
        mkdir($output_dir)
            || die "Unable to create directory $output_dir: $!\n";
    }

    print STDERR "Searching Google Images for: ", join(", ",@$search_terms), "\n";

    my $gi_base = gi_url_base();
    my $nextpage_url = gi_query_url($search_terms);

    my $respage_fname = "$output_dir/respage1.html";
    get_gi_page($gi_base,$nextpage_url,$respage_fname);

    ($nextpage_url, $imgref_urls)
        = parse_gi_search_page($gi_base,$output_dir,
                               $respage_fname,$nextpage_url);

    print STDERR "-" x 40, "\n";

    return $imgref_urls;
}


1;
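
A minimal usage sketch (the search terms and output directory below are
made-up examples, and it assumes giget.pm can be found on @INC; the file
defines its subroutines in the caller's package, so a plain require is
enough):

    #!/usr/bin/perl
    use strict;

    require "giget.pm";

    # Hypothetical example: fetch up to two thumbnail-sized images matching
    # "kiwi bird" into ./kiwi_images/, then list the pages they came from.
    my $imgref_urls = giget(["kiwi", "bird"], "kiwi_images");

    foreach my $url (@$imgref_urls) {
        print "Image referenced from: $url\n";
    }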