#!/usr/bin/perl -w

# Harvests HathiTrust catalog records for a search query.
#
# Usage: <script> [search terms ...]
#
# The search terms are joined with "+" (defaulting to "zealand"), the
# catalog search result pages are walked via their "Next" links, and the
# bibliographic JSON of every "Catalog Record" link found is saved below
# the "output" directory.

use strict;
use warnings;

use File::Path qw(make_path);
use LWP;
use OAuth::Lite::Consumer;
use OAuth::Lite::AuthMethod;
use WWW::Mechanize;
# use CGI;


# data_api($doc_id)
#
# Sends an OAuth-signed GET request to a HathiTrust "htd" URL and prints the
# response body to STDOUT.  On failure the signed request URI and the HTTP
# status line are printed to STDERR instead.
#
# NOTE(review): $doc_id is accepted but not used; the request URL is
# hard-coded below.
# NOTE(review): the consumer key and secret are embedded in the source --
# consider loading them from the environment or a config file.
sub data_api
{
    my ($doc_id) = @_;

    #my $access_key = 'PUBLIC_OAUTH_CONSUMER_KEY';
    #my $secret_key = 'PUBLIC_OAUTH_CONSUMER_SECRET';
    my $access_key = '7e6ee38bae';  # PUBLIC_OAUTH_CONSUMER_KEY
    my $secret_key = 'e0429c0394385486249b4a230702';  # PUBLIC_OAUTH_CONSUMER_SECRET

    #my $request_url = 'http://babel.hathitrust.org/cgi/htd/dapiserver';
    #my $request_url = "http://babel.hathitrust.org/cgi/htd/meta/mdp.39015019203879";
    my $request_url = "http://babel.hathitrust.org/cgi/htd/pagemeta/mdp.39015000000128/12";

    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY
    );

    my $response = $consumer->request(
        'method' => 'GET',
        'url'    => $request_url,
        # 'params' => { 'hello' => 'world' }
    );

    # Debugging aids:
    #   print $consumer->oauth_request->uri;   # URL sent to the server
    #   print $response->status_line;          # HTTP status received

    if ($response->is_success) {
        print "Recieved content:\n";
        print "------\n";
        print $response->content();
    } else {
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        # The status line is passed straight to this print; a nested
        # 'print' here would send it to STDOUT and emit "1" on STDERR.
        print STDERR "**** Status: ", $response->status_line, "\n";
    }

    ## print STDERR "*****\n ", $consumer->oauth_request->uri, "\n";
    return;
}


# bibliographic_api($catalog_id)
#
# Fetches "<catalog_id>.json" from the HathiTrust catalog "volumes/full"
# API and stores it on disk.
#
# Files are fanned out into nested directories below "output": the digits
# of the id are split into chunks of up to two digits, and every chunk
# except the last one (when there is more than one) becomes a directory
# level.  For example id "000123456" is saved as
# "output/00/01/23/45/000123456.json".
#
# An already existing file is left untouched.  Failures (HTTP, directory
# creation, file writing) are reported on STDERR; nothing is returned.
sub bibliographic_api
{
    my ($catalog_id) = @_;

    my $catalog_json = "$catalog_id.json";
    my $base_url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber";
    my $url = "$base_url/$catalog_json";

    my $ua = LWP::UserAgent->new();
    # $ua->agent("Greenstone DL Ingest");

    my $request  = HTTP::Request->new(GET => $url);
    my $response = $ua->request($request);

    if (!$response->is_success()) {
        print STDERR "Error: Failed to retrieve $url\n";
        print STDERR "-----\n";
        print STDERR "Status line: ", $response->status_line(), "\n";
        print STDERR " ", $response->content(),"\n";
        return;
    }

    my $content = $response->content();

    my $group_by_dir = "output";
    if (!-d $group_by_dir) {
        print "Creating '$group_by_dir'\n";
    }

    # The final chunk is not used as a directory level unless it is the
    # only chunk.  Working on the list directly (rather than looping while
    # a shifted chunk is true) keeps a "0" chunk from cutting the path short.
    my @group_by = ($catalog_id =~ m/\d{1,2}/g);
    pop(@group_by) if (scalar(@group_by) > 1);
    $group_by_dir = join("/", $group_by_dir, @group_by);

    make_path($group_by_dir, { error => \my $mkdir_errors });
    if (@$mkdir_errors) {
        foreach my $diagnostic (@$mkdir_errors) {
            # Each diagnostic is a single-pair hash: path => message
            # (the path is empty for general errors).
            my ($failed_path, $message) = %$diagnostic;
            $failed_path = $group_by_dir if ($failed_path eq "");
            print STDERR "Error: Failed to create directory $failed_path: $message\n";
        }
        return;
    }

    my $ofilename = "$group_by_dir/$catalog_json";

    if (-e $ofilename) {
        print STDOUT "$ofilename already exists. Skipping.\n";
        return;
    }

    my $jout;
    if (!open($jout, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename\n";
        print STDERR "$!\n";
        return;
    }

    print {$jout} $content;
    print {$jout} "\n";

    # Buffered write errors (e.g. disk full) only surface at close time.
    if (!close($jout)) {
        print STDERR "Error: Failed to write $ofilename: $!\n";
    }

    return;
}


# main(\@argv)
#
# Runs a HathiTrust catalog search for the given terms (joined with "+",
# default "zealand") and calls bibliographic_api() for every
# "Catalog Record" link on every result page, following the "Next" link
# until a page without one has been processed.
#
# NOTE(review): the search terms are appended to the URL as given, without
# URL-escaping.
sub main
{
    my ($argv_ref) = @_;

    my $query = join("+", @$argv_ref) || "zealand";

    my $base_url = "http://catalog.hathitrust.org/Search/Home?checkspelling=true&type=all&submit=&type=all&sethtftonly=true";
    my $url = $base_url . "&lookfor=" . $query;

    my $mech = WWW::Mechanize->new();
    $mech->get($url);

    while (1) {
        my $catalog_links = $mech->find_all_links(text_regex => qr/^Catalog Record\s*/);
        # my $full_links = $mech->find_all_links(text_regex => qr/^Full view\s*$/,
        #                                        url_regex  => qr/hdl\.handle\.net/);
        # my $restricted_links = $mech->find_all_links(text_regex => qr/^Limited \(search-only\)/,
        #                                              url_regex  => qr/hdl\.handle\.net/);

        foreach my $cat_link (@$catalog_links) {
            my $cat_url = $cat_link->url();
            my ($cat_id) = ($cat_url =~ m/\/([^\/]*)$/);

            if (!defined($cat_id) || $cat_id eq "") {
                print STDERR "Warning: No catalog id found in link $cat_url. Skipping.\n";
                next;
            }

            print "cat id = $cat_id\n";
            bibliographic_api($cat_id);
        }

        # The current page is harvested before looking for "Next", so the
        # last (or only) result page is processed as well.
        my $next_link = $mech->find_link(text_regex => qr/^Next\s+/);
        last if (!defined($next_link));

        $url = $next_link->url();
        $mech->get($url);

        print "Away to Process link: $url\n";
    }

    return;
}

main(\@ARGV);