source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-catalog-search.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 11 years ago

Initial cut at code for exporting content out of the HathiTrust, suitable for ingest by Greenstone

File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3use strict;
4use warnings;
5
6use LWP;
7
8use OAuth::Lite::Consumer;
9use OAuth::Lite::AuthMethod;
10
11use WWW::Mechanize;
12
13# use CGI;
14
15
# data_api($doc_id)
#
# Sends an OAuth-signed GET request to the HathiTrust Data API and prints the
# body of the response to STDOUT.  When the request fails, the signed request
# URI and the HTTP status line are reported on STDERR instead.
#
# Parameters:
#   $doc_id - accepted for interface compatibility.
#             NOTE(review): currently unused; the request URL below is a
#             fixed sample page-metadata request.  Confirm which endpoint
#             should be built from $doc_id before wiring it in.
#
# NOTE(review): the consumer key/secret are embedded in the source.  Consider
# loading them from configuration or the environment instead.
sub data_api
{
    my ($doc_id) = @_;

    my $access_key = '7e6ee38bae';                   # PUBLIC_OAUTH_CONSUMER_KEY
    my $secret_key = 'e0429c0394385486249b4a230702'; # PUBLIC_OAUTH_CONSUMER_SECRET

    # Other endpoints that have been tried with this code:
    #   http://babel.hathitrust.org/cgi/htd/dapiserver
    #   http://babel.hathitrust.org/cgi/htd/meta/mdp.39015019203879
    my $request_url = "http://babel.hathitrust.org/cgi/htd/pagemeta/mdp.39015000000128/12";

    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY,
    );

    my $response = $consumer->request(
        'method' => 'GET',
        'url'    => $request_url,
    );

    if ($response->is_success) {
        print "Recieved content:\n";
        print "------\n";

        print $response->content();
    }
    else {
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        # The status line is passed straight to this print.  A nested
        # 'print' here would send it to STDOUT and emit that print's
        # return value on STDERR instead.
        print STDERR "**** Status: ", $response->status_line, "\n";
    }
}
67
68
# bibliographic_api($catalog_id)
#
# Downloads the full JSON record for one catalog record number from the
# HathiTrust Bibliographic API and stores it as
#
#   output/<dd>/<dd>/.../<catalog_id>.json
#
# where the <dd> sub-directories are the successive one- or two-digit groups
# of the catalog id.  The final group is left out of the path, unless the id
# yields only a single group.  A record whose output file already exists is
# never overwritten, so in that case the download is skipped altogether.
#
# Parameters:
#   $catalog_id - catalog record number, e.g. as taken from a "Catalog Record"
#                 link on a search results page.
#
# Problems (missing id, HTTP failure, directory or file errors) are reported
# on STDERR and the sub returns without dying, so a caller looping over many
# records can carry on with the next one.
sub bibliographic_api
{
    my ($catalog_id) = @_;

    if (!defined($catalog_id) || $catalog_id eq "") {
        print STDERR "Error: No catalog id given\n";
        return;
    }

    my $catalog_json = "$catalog_id.json";
    my $base_url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber";
    my $url = "$base_url/$catalog_json";

    # Work out the nested output directories: every digit group but the
    # last becomes a directory level (a lone group is used as-is).
    my @subdirs = ($catalog_id =~ m/\d{1,2}/g);
    pop(@subdirs) if (scalar(@subdirs) > 1);

    my $top_dir = "output";
    my @dirs = ($top_dir);
    foreach my $subdir (@subdirs) {
        push(@dirs, "$dirs[-1]/$subdir");
    }

    my $ofilename = "$dirs[-1]/$catalog_json";

    # Existing files are never overwritten, so check before spending a
    # network round trip on content that would only be thrown away.
    if (-e $ofilename) {
        print STDOUT "$ofilename already exists. Skipping.\n";
        return;
    }

    my $ua = LWP::UserAgent->new();
#   $ua->agent("Greenstone DL Ingest");

    my $response = $ua->request(HTTP::Request->new(GET => $url));

    if (!$response->is_success()) {
        print STDERR "Error: Failed to retrieve $url\n";
        print STDERR "-----\n";
        print STDERR "Status line: ", $response->status_line(), "\n";
        print STDERR " ", $response->content(),"\n";
        return;
    }

    my $content = $response->content();

    foreach my $dir (@dirs) {
        next if (-d $dir);

        print "Creating '$dir'\n" if ($dir eq $top_dir);

        if (!mkdir($dir)) {
            my $mkdir_error = "$!";   # capture before -d can clobber $!
            next if (-d $dir);        # someone else created it meanwhile

            print STDERR "Error: Failed to create directory $dir\n";
            print STDERR "$mkdir_error\n";
            return;
        }
    }

    # Three-argument open with a lexical handle: the filename can never be
    # misread as an open mode, and the handle cannot leak.
    my $jout;
    if (!open($jout, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename\n";
        print STDERR "$!\n";
        return;
    }

    my $wrote_ok = print {$jout} $content, "\n";

    # Buffered write errors only surface at close time, so check both.
    if (!close($jout) || !$wrote_ok) {
        print STDERR "Error: Failed to write $ofilename\n";
        print STDERR "$!\n";
    }

    return;
}
135
136
137
# main($argv_ref)
#
# Runs a full-text-only search against the HathiTrust catalog and, for every
# "Catalog Record" link on every page of results, downloads the matching
# bibliographic record via bibliographic_api().
#
# Parameters:
#   $argv_ref - reference to the list of search terms (normally \@ARGV).
#               The terms are URL-escaped and joined with '+'; when no terms
#               are given the query defaults to "zealand".
#
# Result pages are followed through their "Next" link until a page without
# one is reached.  That final page is harvested too.
#
# NOTE(review): WWW::Mechanize is created with its default settings; with
# autocheck enabled (the documented default in current releases) a failed
# page fetch raises an exception that is not caught here.
sub main
{
    my ($argv_ref) = @_;

    # Percent-encode everything outside the RFC 3986 unreserved set so that
    # characters such as '&', '#' or '=' in a search term cannot alter the
    # structure of the query string.
    my @terms = map {
        my $term = $_;
        utf8::encode($term) if (utf8::is_utf8($term));
        $term =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;
        $term;
    } @$argv_ref;

    my $query = join("+", @terms) || "zealand";

    my $base_url = "http://catalog.hathitrust.org/Search/Home?checkspelling=true&type=all&submit=&type=all&sethtftonly=true";
    my $url = $base_url . "&lookfor=" . $query;

    my $mech = WWW::Mechanize->new();
    $mech->get($url);

    while (1) {

        # Harvest the current page first, so that the last page of results
        # (or the only page) is processed as well.
        my $catalog_links = $mech->find_all_links(text_regex => qr/^Catalog Record\s*/);

        foreach my $cat_link (@$catalog_links) {
            my $cat_url = $cat_link->url();
            my ($cat_id) = ($cat_url =~ m/\/([^\/]*)$/);

            # A link URL without any '/' yields no id; skip it.
            next if (!defined($cat_id));

            print "cat id = $cat_id\n";
            bibliographic_api($cat_id);
        }

        my $next_link = $mech->find_link( text_regex => qr/^Next\s+/);
        last if (!defined($next_link));

        $url = $next_link->url();
        print "Away to Process link: $url\n";

        $mech->get($url);
    }

    return;
}
193
194
# Script entry point: hand the command-line search terms to main() by
# reference.
main(\@ARGV);
196
197
198
Note: See TracBrowser for help on using the repository browser.