Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

HathitrustCatalogDownload.pm@ 26443

Last change on this file since 26443 was 26443, checked in by davidb, 11 years ago
Getting the ball rolling with a clone of the OAIDownload.pm module
File size: 21.8 KB

Line
1	###########################################################################
2	#
3	# HathitrustCatalogDownload.pm -- download content from Hathitrust Catalog
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26
27	# page-scrape to download content from the Hathistrust catalog
28
29
30	package HathitrustCatalogDownload;
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39
40	use WgetDownload;
41	use XMLParser;
42
43	use POSIX qw(tmpnam);
44	use util;
45
46
47	use LWP;
48
49	use OAuth::Lite::Consumer;
50	use OAuth::Lite::AuthMethod;
51
52	use WWW::Mechanize;
53
54
55	sub BEGIN {
56	@HathitrustCatalogDownload::ISA = ('WgetDownload');
57	}
58
59	my $arguments =
60	[ { 'name' => "url",
61	'disp' => "{HathitrustCatalogDownload.url_disp}",
62	'desc' => "{HathitrustCatalogDownload.url}",
63	'type' => "string",
64	'reqd' => "yes"},
65	{ 'name' => "metadata_prefix",
66	'disp' => "{HathitrustCatalogDownload.metadata_prefix_disp}",
67	'desc' => "{HathitrustCatalogDownload.metadata_prefix}",
68	'type' => "string",
69	'deft' => "oai_dc",
70	'reqd' => "no"},
71	{ 'name' => "set",
72	'disp' => "{HathitrustCatalogDownload.set_disp}",
73	'desc' => "{HathitrustCatalogDownload.set}",
74	'type' => "string",
75	'reqd' => "no"},
76	{ 'name' => "get_doc",
77	'disp' => "{HathitrustCatalogDownload.get_doc_disp}",
78	'desc' => "{HathitrustCatalogDownload.get_doc}",
79	'type' => "flag",
80	'reqd' => "no"},
81	{ 'name' => "get_doc_exts",
82	'disp' => "{HathitrustCatalogDownload.get_doc_exts_disp}",
83	'desc' => "{HathitrustCatalogDownload.get_doc_exts}",
84	'type' => "string",
85	'deft' => "doc,pdf,ppt",
86	'reqd' => "no"},
87	{ 'name' => "max_records",
88	'disp' => "{HathitrustCatalogDownload.max_records_disp}",
89	'desc' => "{HathitrustCatalogDownload.max_records}",
90	'type' => "int",
91	'range' => "1,",
92	'reqd' => "no"} ];
93
94	my $options = { 'name' => "HathitrustCatalogDownload",
95	'desc' => "{HathitrustCatalogDownload.desc}",
96	'abstract' => "no",
97	'inherits' => "yes",
98	'args' => $arguments };
99
100	sub new
101	{
102	my ($class) = shift (@_);
103	my ($getlist,$inputargs,$hashArgOptLists) = @_;
104	push(@$getlist, $class);
105
106	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107	push(@{$hashArgOptLists->{"OptList"}},$options);
108
109	my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
110
111	if ($self->{'info_only'}) {
112	# don't worry about any options etc
113	return bless $self, $class;
114	}
115
116	my $parser = new XML::Parser('Style' => 'Stream',
117	'PluginObj' => $self,
118	'Handlers' => {'Char' => \&Char,
119	'Start' => \&OAI_StartTag,
120	'End' => \&OAI_EndTag
121	});
122	$self->{'parser'} = $parser;
123
124	# make sure the tmp directory that we will use later exists
125	my $tmp_dir = "$ENV{GSDLHOME}/tmp";
126	if (! -e $tmp_dir) {
127	&util::mk_dir($tmp_dir);
128	}
129
130	# if max_records not specified, parsing will have set it to ""
131	undef $self->{'max_records'} if $self->{'max_records'} eq "";
132
133	# set up hashmap for individual items in get_doc_exts
134	# to make testing for matches easier
135
136	$self->{'lookup_exts'} = {};
137	my $get_doc_exts = $self->{'get_doc_exts'};
138
139	if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
140	my @exts = split(/,\s*/,$get_doc_exts);
141	foreach my $e (@exts) {
142	$self->{'lookup_exts'}->{lc($e)} = 1;
143	}
144	}
145
146
147	return bless $self, $class;
148	}
149
150	sub download
151	{
152	my ($self) = shift (@_);
153	my ($hashGeneralOptions) = @_;
154
155	## my $cmdWget = $strWgetOptions;
156
157	my $strOutputDir ="";
158	$strOutputDir = $hashGeneralOptions->{"cache_dir"};
159	my $strBasURL = $self->{'url'};
160	my $blnDownloadDoc = $self->{'get_doc'};
161
162	print STDERR "<<Defined Maximum>>\n";
163
164	my $strIDs = $self->getOAIIDs($strBasURL);
165
166	if($strIDs eq "")
167	{
168	print STDERR "Error: No IDs found\n";
169	return 0;
170	}
171
172	my $aryIDs = $self->parseOAIIDs($strIDs);
173	my $intIDs = 0;
174	if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
175	{
176	$intIDs = $self->{'max_records'};
177	}
178	else
179	{
180	$intIDs = scalar(@$aryIDs);
181	}
182	print STDERR "<<Total number of record(s):$intIDs>>\n";
183
184	$self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
185
186	# my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
187	# &util::rm($tmp_file);
188
189	return 1;
190	}
191
192	sub getOAIIDs
193	{
194	my ($self,$strBasURL) = @_;
195	## my ($cmdWget);
196
197	my $wgetOptions = $self->getWgetOptions();
198
199	my $cmdWget = $wgetOptions;
200
201	print STDERR "Gathering OAI identifiers.....\n";
202
203	my $metadata_prefix = $self->{'metadata_prefix'};
204	$cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
205
206	# if $set specified, add it in to URL
207	my $set = $self->{'set'};
208	$cmdWget .= "&set=$set" if ($set ne "");
209
210	$cmdWget .= "\" ";
211
212	my $accumulated_strIDs = "";
213	my $strIDs = $self->useWget($cmdWget);
214
215	if (!defined $strIDs or $strIDs eq "" ){
216	print STDERR "Server information is unavailable.\n";
217	print STDERR "<<Finished>>\n";
218	return;
219	}
220	if ($self->{'forced_quit'}) {
221	return $strIDs;
222	}
223
224	print STDERR "<<Download Information>>\n";
225
226	$self->parse_xml($strIDs);
227
228	$accumulated_strIDs = $strIDs;
229	my $max_recs = $self->{'max_records'};
230	while ($strIDs =~ m/<resumptionToken.?>\s(.?)\s<\/resumptionToken>/s) {
231	# top up list with further requests for IDs
232
233	my $resumption_token = $1;
234
235	$cmdWget = $wgetOptions;
236
237	$cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
238
239	$strIDs = $self->useWget($cmdWget);
240	if ($self->{'forced_quit'}) {
241	return $accumulated_strIDs;
242	}
243
244	$self->parse_xml($strIDs);
245
246	$accumulated_strIDs .= $strIDs;
247
248	my @accumulated_identifiers
249	= ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
250
251	my $num_acc_identifiers = scalar(@accumulated_identifiers);
252	if (defined $max_recs && $num_acc_identifiers > $max_recs ) {
253	last;
254	}
255	}
256
257	return $accumulated_strIDs;
258	}
259
260	sub parseOAIIDs
261	{
262	my ($self,$strIDs) = @_;
263
264	print STDERR "Parsing OAI identifiers.....\n";
265	$strIDs =~ s/^.*?<identifier>/<identifier>/s;
266	$strIDs =~ s/^(.<\/identifier>).$/$1/s;
267
268	my @aryIDs = ();
269
270	while ($strIDs =~ m/<identifier>(.?)<\/identifier>(.)$/s)
271	{
272	$strIDs = $2;
273	push(@aryIDs,$1);
274	}
275
276	return \@aryIDs;
277	}
278
279	sub dirFileSplit
280	{
281	my ($self,$strFile) = @_;
282
283	my @aryDirs = split("[/\]",$strFile);
284
285	my $strLocalFile = pop(@aryDirs);
286	my $strSubDirs = join("/",@aryDirs);
287
288	return ($strSubDirs,$strLocalFile);
289	}
290
291	sub getOAIDoc
292	{
293	my ($self,$strRecord, $oai_rec_filename) = @_;
294
295	print STDERR "Gathering source documents.....\n";
296	# look out for identifier tag in metadata section
297
298	if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
299	{
300	my $strMetaTag = $1;
301	my $had_valid_url = 0;
302	my $count = 1;
303	while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
304	{
305	my $doc_id_url = $2;
306	print STDERR "Found doc url: $doc_id_url\n";
307	next if ($doc_id_url !~ m/^(https?\|ftp):\/\//);
308
309	my $orig_doc_id_url = $doc_id_url;
310	$had_valid_url = 1;
311
312	my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.)\/(.?)$/);
313	my $faked_ext = 0;
314	my $primary_doc_match = 0;
315
316	my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
317
318	if (defined $id_file_ext) {
319	# cross-check this filename extension with get_doc_exts option
320	# if provided
321	my $lookup_exts = $self->{'lookup_exts'};
322
323	if (defined $lookup_exts->{lc($id_file_ext)}) {
324	# this initial URL matches requirement
325	$primary_doc_match = 1;
326	}
327	}
328	else {
329	$faked_ext = 1;
330	$id_file_ext = "html";
331	}
332
333
334	if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
335	# Download this doc if HTML, scan through it looking for a link
336	# that does match get_doc_exts
337
338
339	# 1. Generate a tmp name
340	my $tmp_filename = &util::get_tmp_filename();
341
342	# 2. Download it
343	my $wget_opts2 = $self->getWgetOptions();
344	my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
345	my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2);
346	return $strRecord if $self->{'forced_quit'};
347
348	if($error2 ne "")
349	{
350	print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
351	#exit(-1);
352	next;
353	}
354
355	if (defined $follow2) {
356	# src url was "redirected" to another place
357	# => pick up on this and make it the new doc_id_url
358	$doc_id_url = $follow2;
359	}
360
361	my $primary_doc_html = "";
362	if (open(HIN,"<$tmp_filename")) {
363	my $line;
364	while (defined ($line = <HIN>)) {
365	$primary_doc_html .= $line;
366	}
367	close(HIN);
368
369	# 3. Scan through it looking for match
370	#
371	# if got match, change $doc_id_url to this new URL and
372	# $id_file_ext to 'match'
373
374	my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
375
376	my $lookup_exts = $self->{'lookup_exts'};
377
378	foreach my $href (@href_links) {
379	my ($ext) = ($href =~ m/\.([^\.]+)$/);
380
381	if ((defined $ext) && (defined $lookup_exts->{$ext})) {
382
383	if ($href !~ m/^(https?\|ftp):\/\//) {
384	# link is within current site
385	my ($site_domain) = ($doc_id_url =~ m/^((?:https?\|ftp):\/\/.*?)\//);
386
387	$href = "$site_domain$href";
388	}
389
390	$doc_id_url = $href;
391	$id_file_ext = $ext;
392	last;
393	}
394	}
395	}
396	else {
397	print STDERR "Error occurred while retrieving OAI source documents (2):\n";
398	print STDERR "$!\n";
399	}
400
401	if (-e $tmp_filename) {
402	&util::rm($tmp_filename);
403	}
404	}
405
406	my $download_doc_filename = $oai_rec_filename;
407	my $new_extension = "\-$count\.$id_file_ext";
408	$count++;
409	#$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
410	$download_doc_filename =~ s/\.oai$/$new_extension/;
411	my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
412
413	# may have ' in url - others??
414	my $safe_doc_id_url = $doc_id_url;
415	$safe_doc_id_url =~ s/'/\'/g;
416
417	my $wget_opts = $self->getWgetOptions();
418	my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
419
420	my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd);
421	return $strRecord if $self->{'forced_quit'};
422
423	if($errors ne "")
424	{
425	print STDERR "Error occured while retriving OAI souce documents (3):\n";
426	print STDERR "$errors\n";
427	#exit(-1);
428	next;
429	}
430
431
432	$strRecord =~ s/<metadata>(.?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
433	}
434
435	if (!$had_valid_url)
436	{
437	print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
438	}
439	}
440	else
441	{
442	print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
443	}
444
445	return $strRecord;
446	}
447
448	sub getOAIRecords
449	{
450	my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
451	my $intDocCounter = 0;
452
453	my $metadata_prefix = $self->{'metadata_prefix'};
454
455	foreach my $strID ( @$aryIDs)
456	{
457	print STDERR "Gathering OAI record with ID $strID.....\n";
458
459	my $wget_opts = $self->getWgetOptions();
460	my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
461
462	my $strRecord = $self->useWget($cmdWget);
463
464	my @fileDirs = split(":",$strID);
465	my $local_id = pop @fileDirs;
466
467	# setup directories
468
469	$strOutputDir =~ s/"//g; #"
470
471	my $host =$self->{'url'};
472
473	$host =~ s@https?:\/\/@@g;
474
475	$host =~ s/:.*//g;
476
477	my $strFileURL = "";
478	if ($strOutputDir ne "") {
479	$strFileURL = "$strOutputDir/";
480	}
481	$strFileURL .= "$host/$local_id.oai";
482
483	# prepare subdirectory for record (if needed)
484	my ($strSubDirPath,$unused) = ("", "");
485
486	($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
487
488	&util::mk_all_dir($strSubDirPath);
489
490	my $ds = &util::get_dirsep();
491
492	if($blnDownloadDoc)
493	{
494	$strRecord = $self->getOAIDoc($strRecord,$strFileURL);
495	}
496
497	# save record
498	open (OAIOUT,">$strFileURL")
499	\|\| die "Unable to save oai metadata record: $!\n";
500	print OAIOUT $strRecord;
501	close(OAIOUT);
502
503	print STDERR "Saving record to $strFileURL\n";
504	print STDERR "<<Done>>\n";
505	$intDocCounter ++;
506	last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
507	}
508
509	(defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
510	print STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
511	print STDERR "Complete download meta record from $strBasURL\n";
512
513	print STDERR "<<Finished>>\n";
514	}
515
516	sub url_information
517	{
518	my ($self) = shift (@_);
519	if(!defined $self){ die "System Error: No \$self defined for url_information in HathitrustCatalogDownload\n";}
520
521	my $wgetOptions = $self->getWgetOptions();
522	my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
523
524	my $strIdentify = "verb=Identify";
525	my $strListSets = "verb=ListSets";
526	my $strListMdFormats = "verb=ListMetadataFormats";
527
528	my $strIdentifyCMD = $strBaseCMD;
529	$strIdentifyCMD =~ s/_OPTS_/$strIdentify/;
530
531	my $strIdentifyText = $self->useWget($strIdentifyCMD);
532
533	if (!defined $strIdentifyText or $strIdentifyText eq "" ){
534	print STDERR "Server information is unavailable.\n";
535	print STDERR "<<Finished>>\n";
536	return;
537	}
538
539	print STDERR "General information:\n";
540	$self->parse_xml($strIdentifyText);
541	print STDERR "\n";
542
543	print STDERR "=" x 10, "\n";
544	print STDERR "Metadata Format Information (metadataPrefix):\n";
545	print STDERR "=" x 10, "\n";
546
547	my $strListMdFormatsCMD = $strBaseCMD;
548	$strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;
549	my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
550
551	$self->parse_xml($strListMdFormatsText);
552	print STDERR "\n";
553
554	print STDERR "=" x 10, "\n";
555	print STDERR "List Information:\n";
556	print STDERR "=" x 10, "\n";
557
558	my $strListSetCMD = $strBaseCMD;
559	$strListSetCMD =~ s/_OPTS_/$strListSets/;
560	my $strListSetsText = $self->useWget($strListSetCMD);
561
562	$self->parse_xml($strListSetsText);
563	}
564
565	sub parse_xml
566	{
567	my ($self) = shift (@_);
568	my ($xml_text) = @_;
569
570	#### change this to work directly from $xml_text
571
572	#Open a temporary file to store OAI information, and store the information to the temp file
573	my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
574
575	open(*OAIOUT,"> $name");
576
577	print OAIOUT $xml_text;
578	close(OAIOUT);
579
580	$self->{'temp_file_name'} = $name;
581
582	## print STDERR "**** xml text = $xml_text\n";
583
584	eval {
585	$self->{'parser'}->parsefile("$name");
586	##$self->{'parser'}->parse($xml_text);
587	};
588
589	if ($@) {
590	die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
591	##die "OAI: Parsed text is not a well formed XML file ($@)\n";
592	}
593
594	unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
595	}
596
597	####END
598	#{
599	# if($self->{'info'})
600	# {
601	#unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
602	# }
603	#}
604
605	# This Char function overrides the one in XML::Parser::Stream to overcome a
606	# problem where $expat->{Text} is treated as the return value, slowing
607	# things down significantly in some cases.
608	sub Char {
609	use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
610	$_[0]->{'Text'} .= $_[1];
611
612	my $self = $_[0]->{'PluginObj'};
613	if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
614	$self->{'text'} .= $_[1];
615	$self->{'text'} =~ s/[\n]\|([ ]{2,})//g;
616	if($self->{'text'} ne "")
617	{
618	print STDERR " $self->{'subfield'}:($self->{'text'})\n";
619	}
620	}
621	return undef;
622	}
623
624	sub OAI_StartTag
625	{
626	my ($expat, $element, %attr) = @_;
627
628	my $self = $expat->{'PluginObj'};
629	$self->{'subfield'} = $element;
630
631	}
632
633	sub OAI_EndTag
634	{
635	my ($expat, $element) = @_;
636
637	my $self = $expat->{'PluginObj'};
638	$self->{'text'} = "";
639	$self->{'subfield'} = "";
640	}
641
642	sub error
643	{
644	my ($self,$strFunctionName,$strError) = @_;
645	{
646	print "Error occoured in HathitrustCatalogDownload.pm\n".
647	"In Function:".$strFunctionName."\n".
648	"Error Message:".$strError."\n";
649	exit(-1);
650	}
651	}
652
653	#=======
654
655	sub data_api
656	{
657	my ($doc_id) = @_;
658
659	#my $access_key = 'PUBLIC_OAUTH_CONSUMER_KEY';
660	#my $secret_key = 'PUBLIC_OAUTH_CONSUMER_SECRET';
661
662	my $access_key = '7e6ee38bae'; # PUBLIC_OAUTH_CONSUMER_KEY
663	my $secret_key = 'e0429c0394385486249b4a230702'; # PUBLIC_OAUTH_CONSUMER_SECRET
664
665	#my $request_url = 'http://babel.hathitrust.org/cgi/htd/dapiserver';
666	#my $request_url = "http://babel.hathitrust.org/cgi/htd/meta/mdp.39015019203879";
667	my $request_url = "http://babel.hathitrust.org/cgi/htd/pagemeta/mdp.39015000000128/12";
668
669
670	my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
671	'consumer_secret' => $secret_key,
672	'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );
673
674	my $response = $consumer->request( 'method' => 'GET',
675	'url' => $request_url,
676	# 'params' => { 'hello' => 'world' }
677	);
678
679	# print CGI::header();
680
681	# print "<p><b>[CLIENT] sent this URL to server:</b><br/>";
682	# print $consumer->oauth_request->uri;
683
684	# print "<p><b>[CLIENT] received this HTTP response from server:</b><br/>";
685	# print $response->status_line;
686
687	if ($response->is_success) {
688	# print "<br/><b>[CLIENT] received this content response from server:</b><blockquote>" .
689	# $response->content . "</blockquote>";
690
691	print "Recieved content:\n";
692	print "------\n";
693
694	print $response->content()
695	}
696	else {
697	print STDERR "**** Failed to retrieval any content from URL:\n";
698	print STDERR " ", $consumer->oauth_request->uri, "\n";
699	print STDERR "**** Status: ", print $response->status_line, "\n";
700
701	}
702
703
704	## print STDERR "*****\n ", $consumer->oauth_request->uri, "\n";
705	}
706
707
708	sub bibliographic_api
709	{
710	my ($catalog_id) = @_;
711
712	my $catalog_json = "$catalog_id.json";
713	my $base_url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber";
714	my $url = "$base_url/$catalog_json";
715
716	my $ua = LWP::UserAgent->new();
717	# $ua->agent("Greenstone DL Ingest");
718
719	# make request
720	my $request = HTTP::Request->new(GET => $url);
721
722	# get response
723	my $response = $ua->request($request);
724
725	if ($response->is_success()) {
726
727	my $content_type = $response->content_type();
728
729	my $content = $response->content();
730
731	my $group_by_dir = "output";
732
733	if (!-d $group_by_dir) {
734	print "Creating '$group_by_dir'\n";
735	mkdir($group_by_dir);
736	}
737
738	my @group_by = ($catalog_id =~ m/\d{1,2}/g);
739
740	while (my $next_subdir = shift @group_by) {
741	$group_by_dir .= "/$next_subdir";
742	if (!-d $group_by_dir) {
743	mkdir($group_by_dir);
744	}
745
746	last if (scalar(@group_by)==1);
747	}
748
749	my $ofilename = "$group_by_dir/$catalog_json";
750	if (!-e $ofilename) {
751	if (open(JOUT,">$ofilename")) {
752
753	print JOUT $content;
754	print JOUT "\n";
755	close(JOUT);
756	}
757	else {
758	print STDERR "Error: Failed to open $ofilename\n";
759	print STDERR "!$\n";
760	}
761	}
762	else {
763	print STDOUT "$ofilename already exists. Skipping.\n";
764	}
765	}
766	else
767	{
768	print STDERR "Error: Failed to retrieve $url\n";
769	print STDERR "-----\n";
770	print STDERR "Status line: ", $response->status_line(), "\n";
771	print STDERR " ", $response->content(),"\n";
772	}
773	}
774
775
776
777	sub main
778	{
779
780	my ($argv_ref) = @_;
781
782	my $query=join("+",@$argv_ref) \|\| "zealand";
783
784	my $base_url = "http://catalog.hathitrust.org/Search/Home?checkspelling=true&type=all&submit=&type=all&sethtftonly=true";
785	my $url = $base_url . "&lookfor=" . $query;
786
787	my $mech = WWW::Mechanize->new();
788	$mech->get($url);
789
790	my $next_link = $mech->find_link( text_regex => qr/^Next\s+/);
791
792	my $count=0;
793
794	while (defined($next_link)) {
795
796
797	my $catalog_links = $mech->find_all_links(text_regex => qr/^Catalog Record\s*/);
798	# my $full_links = $mech->find_all_links(text_regex => qr/^Full view\s*$/,
799	# url_regex => qr/hdl\.handle\.net/);
800	# my $restricted_links = $mech->find_all_links(text_regex => qr/^Limited $search-only$/,
801	# url_regex => qr/hdl\.handle\.net/);
802
803	my $num_catalog_links = scalar(@$catalog_links);
804	# my $num_full_links = scalar(@$full_links);
805	# my $num_restricted_links = scalar(@$restricted_links);
806
807	# print "+++++ num cat links $num_catalog_links: num full = $num_full_links, num restricted = $num_restricted_links\n";
808
809
810	foreach my $cat_link (@$catalog_links) {
811	my $cat_url = $cat_link->url();
812	my ($cat_id) = ($cat_url =~ m/\/([^\/]*)$/);
813	print "cat id = $cat_id\n";
814	bibliographic_api($cat_id);
815	}
816
817
818	$url = $next_link->url();
819
820	$mech->get($url);
821	$next_link = $mech->find_link( text_regex => qr/^Next\s+/);
822
823	$count++;
824
825	## last if ($count==1);
826
827	print "Away to Process link: $url\n";
828
829	}
830
831	}
832
833
834
835	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/hathitrust-downloadfrom/trunk/perllib/downloaders/HathitrustCatalogDownload.pm@ 26443

Download in other formats: