Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

htc-catalog-search.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 11 years ago
Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone
File size: 5.0 KB

Line
1	#!/usr/bin/perl -w
2
3	use strict;
4	use warnings;
5
6	use LWP;
7
8	use OAuth::Lite::Consumer;
9	use OAuth::Lite::AuthMethod;
10
11	use WWW::Mechanize;
12
13	# use CGI;
14
15
# data_api($doc_id)
#
# Sends one OAuth-signed GET request to the HathiTrust Data API and prints
# the response body to STDOUT.  On failure, the signed request URL and the
# HTTP status line are written to STDERR instead.
#
# Parameters:
#   $doc_id - accepted for interface compatibility but NOT used yet: the
#             request URL below is fixed to a single page of a single volume.
#
# NOTE(review): the OAuth consumer key/secret are committed in the source.
# Consider moving them to configuration outside the repository.
sub data_api
{
    my ($doc_id) = @_;

    #my $access_key = 'PUBLIC_OAUTH_CONSUMER_KEY';
    #my $secret_key = 'PUBLIC_OAUTH_CONSUMER_SECRET';

    my $access_key = '7e6ee38bae';                   # PUBLIC_OAUTH_CONSUMER_KEY
    my $secret_key = 'e0429c0394385486249b4a230702'; # PUBLIC_OAUTH_CONSUMER_SECRET

    # Alternative endpoints tried during development:
    #my $request_url = 'http://babel.hathitrust.org/cgi/htd/dapiserver';
    #my $request_url = "http://babel.hathitrust.org/cgi/htd/meta/mdp.39015019203879";
    my $request_url = "http://babel.hathitrust.org/cgi/htd/pagemeta/mdp.39015000000128/12";

    # The OAuth parameters are passed in the URL query string.
    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request(
        'method' => 'GET',
        'url'    => $request_url,
        # 'params' => { 'hello' => 'world' }
    );

    if ($response->is_success) {
        print "Recieved content:\n";
        print "------\n";

        print $response->content();
    }
    else {
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        # The status line is passed as a plain list element.  A nested
        # 'print' here would send it to STDOUT and leave "1" on STDERR.
        print STDERR "**** Status: ", $response->status_line, "\n";
    }

    ## print STDERR "*****\n ", $consumer->oauth_request->uri, "\n";
}
67
68
# bibliographic_api($catalog_id)
#
# Fetches the full bibliographic record for one HathiTrust catalog record
# number as JSON and stores it on disk as
#   output/<NN>/<NN>/.../<catalog_id>.json
# where the <NN> directories are successive one- or two-digit groups taken
# from the catalog id.  This spreads the files over many small directories.
#
# Parameters:
#   $catalog_id - catalog record number (string of digits expected).
#
# An existing output file is never overwritten; a message is printed and the
# record is skipped.  All failures (HTTP, mkdir, open, close) are reported on
# STDERR and the sub simply returns; nothing is thrown.
sub bibliographic_api
{
    my ($catalog_id) = @_;

    if (!defined $catalog_id || $catalog_id eq '') {
        print STDERR "Error: No catalog id given, nothing to retrieve\n";
        return;
    }

    my $catalog_json = "$catalog_id.json";
    my $base_url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber";
    my $url = "$base_url/$catalog_json";

    my $ua = LWP::UserAgent->new();
    # $ua->agent("Greenstone DL Ingest");

    my $request  = HTTP::Request->new(GET => $url);
    my $response = $ua->request($request);

    if (!$response->is_success()) {
        print STDERR "Error: Failed to retrieve $url\n";
        print STDERR "-----\n";
        print STDERR "Status line: ", $response->status_line(), "\n";
        print STDERR " ", $response->content(),"\n";
        return;
    }

    my $content = $response->content();

    my $group_by_dir = "output";

    if (!-d $group_by_dir) {
        print "Creating '$group_by_dir'\n";
        if (!mkdir($group_by_dir)) {
            print STDERR "Error: Failed to create directory $group_by_dir\n";
            print STDERR "$!\n";
            return;
        }
    }

    # Split the id into groups of (up to) two digits; each group except the
    # trailing one becomes a nested sub-directory.
    my @group_by = ($catalog_id =~ m/\d{1,2}/g);

    while (my $next_subdir = shift @group_by) {
        $group_by_dir .= "/$next_subdir";
        if (!-d $group_by_dir) {
            if (!mkdir($group_by_dir)) {
                print STDERR "Error: Failed to create directory $group_by_dir\n";
                print STDERR "$!\n";
                return;
            }
        }

        # Stop before the final group so it is not turned into a directory.
        last if (scalar(@group_by)==1);
    }

    my $ofilename = "$group_by_dir/$catalog_json";
    if (-e $ofilename) {
        print STDOUT "$ofilename already exists. Skipping.\n";
        return;
    }

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and the handle is scoped to this sub.
    if (open(my $jout, '>', $ofilename)) {
        print $jout $content;
        print $jout "\n";
        # Buffered write errors only surface when the handle is closed.
        if (!close($jout)) {
            print STDERR "Error: Failed to finish writing $ofilename\n";
            print STDERR "$!\n";
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename\n";
        print STDERR "$!\n";
    }

    return;
}
135
136
137
# main(\@argv)
#
# Runs a full-text-only search against the HathiTrust catalog and, for every
# "Catalog Record" link on every page of results, downloads the matching
# bibliographic record via bibliographic_api().
#
# Parameters:
#   $argv_ref - reference to the list of search terms; the terms are joined
#               with "+".  When the list is empty the query "zealand" is used.
#
# NOTE(review): the search terms are placed in the URL without escaping, so
# a term containing '&' or '#' will alter the request.
sub main
{
    my ($argv_ref) = @_;

    my $query = join("+", @$argv_ref) || "zealand";

    my $base_url = "http://catalog.hathitrust.org/Search/Home?checkspelling=true&type=all&submit=&type=all&sethtftonly=true";
    my $url = $base_url . "&lookfor=" . $query;

    my $mech = WWW::Mechanize->new();
    $mech->get($url);

    # Harvest the current page first and only then look for a "Next" link.
    # Testing for the link before harvesting would skip the last page of
    # results (and the only page of a single-page result set).
    while (1) {

        my $catalog_links = $mech->find_all_links(text_regex => qr/^Catalog Record\s*/);

        foreach my $cat_link (@$catalog_links) {
            my $cat_url = $cat_link->url();

            # The catalog id is the final path component of the link.
            my ($cat_id) = ($cat_url =~ m/\/([^\/]*)$/);
            if (!defined $cat_id || $cat_id eq '') {
                print STDERR "Warning: No catalog id found in link '$cat_url'. Skipping.\n";
                next;
            }

            print "cat id = $cat_id\n";
            bibliographic_api($cat_id);
        }

        my $next_link = $mech->find_link( text_regex => qr/^Next\s+/);
        last if (!defined($next_link));

        $url = $next_link->url();
        $mech->get($url);

        print "Away to Process link: $url\n";
    }
}


main(\@ARGV);
196
197
198

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-catalog-search.pl@ 26436

Download in other formats: