source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-catalog-search.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 8 years ago

Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone

File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3use strict;
4use warnings;
5
6use LWP;
7
8use OAuth::Lite::Consumer;
9use OAuth::Lite::AuthMethod;
10
11use WWW::Mechanize;
12
13# use CGI;
14
15
# data_api($doc_id)
#
# Sends an OAuth-signed GET request (credentials carried in the URL query
# string) to the HathiTrust Data API and prints the response body to STDOUT.
# On failure, the signed request URI and the HTTP status line are written
# to STDERR.
#
# Parameters:
#   $doc_id - volume identifier.
#             NOTE(review): currently unused -- the request URL below is
#             hard-coded to one page of one volume.
#
# Returns: nothing.
sub data_api
{
    my ($doc_id) = @_;

    # NOTE(review): the consumer key/secret are embedded in the source.
    # Confirm these really are the public credentials before redistributing.
    my $access_key = '7e6ee38bae';                    # PUBLIC_OAUTH_CONSUMER_KEY
    my $secret_key = 'e0429c0394385486249b4a230702';  # PUBLIC_OAUTH_CONSUMER_SECRET

    # Other endpoints tried during development:
    #   http://babel.hathitrust.org/cgi/htd/dapiserver
    #   http://babel.hathitrust.org/cgi/htd/meta/mdp.39015019203879
    my $request_url = "http://babel.hathitrust.org/cgi/htd/pagemeta/mdp.39015000000128/12";

    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY(),
    );

    my $response = $consumer->request(
        'method' => 'GET',
        'url'    => $request_url,
        # 'params' => { 'hello' => 'world' }
    );

    if ($response->is_success) {
        print "Recieved content:\n";
        print "------\n";

        print $response->content();
    }
    else {
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR "       ", $consumer->oauth_request->uri, "\n";
        # The status line must be a plain list element here: a nested
        # 'print' would send it to STDOUT and emit "1" on STDERR instead.
        print STDERR "**** Status: ", $response->status_line, "\n";
    }

    return;
}
67
68
# _catalog_subdirs($catalog_id)
#
# Splits the digits of a catalog id into chunks of up to two digits and
# returns the chunks used as nested sub-directory names.  When there is
# more than one chunk the final chunk is left off (it only appears in the
# file name); a single chunk is returned as-is.
sub _catalog_subdirs
{
    my ($catalog_id) = @_;

    my @chunks = ($catalog_id =~ m/\d{1,2}/g);
    pop(@chunks) if (scalar(@chunks) > 1);

    return @chunks;
}


# bibliographic_api($catalog_id)
#
# Downloads the full bibliographic record for $catalog_id from the
# HathiTrust Bibliographic API and saves it as
#   output/<chunk>/<chunk>/.../<catalog_id>.json
# where the intermediate directories come from _catalog_subdirs().
#
# A record whose output file already exists is skipped *before* any network
# request is made.  Directories are only created once the download has
# succeeded.
#
# Parameters:
#   $catalog_id - catalog record number (string).
#
# Returns: nothing.  All problems (missing id, HTTP failure, mkdir/open/close
# failure) are reported on STDERR and the sub returns early; it never dies.
sub bibliographic_api
{
    my ($catalog_id) = @_;

    if (!defined($catalog_id) || $catalog_id eq "") {
        print STDERR "Error: No catalog id given\n";
        return;
    }

    my $catalog_json = "$catalog_id.json";

    # Work out where the record will live so an existing file can be
    # detected without fetching anything.
    my @subdirs   = _catalog_subdirs($catalog_id);
    my $ofilename = join("/", "output", @subdirs, $catalog_json);

    if (-e $ofilename) {
        print STDOUT "$ofilename already exists.  Skipping.\n";
        return;
    }

    my $base_url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber";
    my $url      = "$base_url/$catalog_json";

    my $ua = LWP::UserAgent->new();
#   $ua->agent("Greenstone DL Ingest");

    my $request  = HTTP::Request->new(GET => $url);
    my $response = $ua->request($request);

    if (!$response->is_success()) {
        print STDERR "Error: Failed to retrieve $url\n";
        print STDERR "-----\n";
        print STDERR "Status line: ", $response->status_line(), "\n";
        print STDERR "  ", $response->content(),"\n";
        return;
    }

    my $content = $response->content();

    # Cumulative list of directories: output, output/aa, output/aa/bb, ...
    my @dirs = ("output");
    foreach my $subdir (@subdirs) {
        push(@dirs, "$dirs[-1]/$subdir");
    }

    foreach my $dir (@dirs) {
        next if (-d $dir);

        # Only creation of the top-level directory is announced.
        print "Creating '$dir'\n" if ($dir eq "output");

        if (!mkdir($dir)) {
            my $mkdir_error = $!;
            print STDERR "Error: Failed to create directory $dir\n";
            print STDERR "$mkdir_error\n";
            return;
        }
    }

    # Three-argument open with a lexical handle: the file name can never be
    # misread as an open mode, and the handle cannot leak.
    my $jout;
    if (!open($jout, ">", $ofilename)) {
        my $open_error = $!;
        print STDERR "Error: Failed to open $ofilename\n";
        print STDERR "$open_error\n";
        return;
    }

    print $jout $content;
    print $jout "\n";

    # Buffered write errors (e.g. disk full) only surface at close time.
    if (!close($jout)) {
        my $close_error = $!;
        print STDERR "Error: Failed to write $ofilename\n";
        print STDERR "$close_error\n";
    }

    return;
}
135
136
137
# main($argv_ref)
#
# Runs a full-text-only HathiTrust catalog search and harvests the
# bibliographic record of every hit.
#
# The search terms are joined with "+" to form the query; with no terms
# the query defaults to "zealand".  Each results page is scanned for
# "Catalog Record" links, the last path segment of each link is taken as the
# catalog id and handed to bibliographic_api().  Paging continues via the
# "Next" link until a page has none.
#
# Parameters:
#   $argv_ref - reference to an array of search terms (normally \@ARGV).
#
# Returns: nothing.
sub main
{
    my ($argv_ref) = @_;

    my $query = join("+", @$argv_ref) || "zealand";

    my $base_url = "http://catalog.hathitrust.org/Search/Home?checkspelling=true&type=all&submit=&type=all&sethtftonly=true";
    my $url      = $base_url . "&lookfor=" . $query;

    my $mech = WWW::Mechanize->new();
    $mech->get($url);

    # Process the page currently loaded *first*, then look for "Next".
    # Testing for the "Next" link before processing would skip the final
    # results page (and the only page of a single-page result).
    while (1) {

        my $catalog_links = $mech->find_all_links(text_regex => qr/^Catalog Record\s*/);
#       my $full_links = $mech->find_all_links(text_regex => qr/^Full view\s*$/,
#                                              url_regex => qr/hdl\.handle\.net/);
#       my $restricted_links = $mech->find_all_links(text_regex => qr/^Limited \(search-only\)/,
#                                                    url_regex => qr/hdl\.handle\.net/);

        foreach my $cat_link (@$catalog_links) {
            my $cat_url = $cat_link->url();

            # Catalog id = everything after the final '/' of the link URL
            my ($cat_id) = ($cat_url =~ m/\/([^\/]*)$/);

            if (!defined($cat_id) || $cat_id eq "") {
                print STDERR "Warning: Failed to extract catalog id from $cat_url.  Skipping.\n";
                next;
            }

            print "cat id = $cat_id\n";
            bibliographic_api($cat_id);
        }

        my $next_link = $mech->find_link( text_regex => qr/^Next\s+/);
        last if (!defined($next_link));

        $url = $next_link->url();
        print "Away to Process link: $url\n";

        $mech->get($url);
    }

    return;
}
193
194
# Script entry point: hand the command-line arguments to main() by reference.
my $cli_args_ref = \@ARGV;
main($cli_args_ref);
196
197
198
Note: See TracBrowser for help on using the repository browser.