source: gs3-extensions/hathitrust-downloadfrom/trunk/perllib/downloaders/HathitrustCatalogDownload.pm@ 26443

Last change on this file since 26443 was 26443, checked in by davidb, 11 years ago

Getting the ball rolling with a clone of the OAIDownload.pm module

File size: 21.8 KB
Line 
1###########################################################################
2#
3# HathitrustCatalogDownload.pm -- download content from Hathitrust Catalog
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26
27# page-scrape to download content from the Hathistrust catalog
28
29
30package HathitrustCatalogDownload;
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict;
39
40use WgetDownload;
41use XMLParser;
42
43use POSIX qw(tmpnam);
44use util;
45
46
47use LWP;
48
49use OAuth::Lite::Consumer;
50use OAuth::Lite::AuthMethod;
51
52use WWW::Mechanize;
53
54
55sub BEGIN {
56 @HathitrustCatalogDownload::ISA = ('WgetDownload');
57}
58
59my $arguments =
60 [ { 'name' => "url",
61 'disp' => "{HathitrustCatalogDownload.url_disp}",
62 'desc' => "{HathitrustCatalogDownload.url}",
63 'type' => "string",
64 'reqd' => "yes"},
65 { 'name' => "metadata_prefix",
66 'disp' => "{HathitrustCatalogDownload.metadata_prefix_disp}",
67 'desc' => "{HathitrustCatalogDownload.metadata_prefix}",
68 'type' => "string",
69 'deft' => "oai_dc",
70 'reqd' => "no"},
71 { 'name' => "set",
72 'disp' => "{HathitrustCatalogDownload.set_disp}",
73 'desc' => "{HathitrustCatalogDownload.set}",
74 'type' => "string",
75 'reqd' => "no"},
76 { 'name' => "get_doc",
77 'disp' => "{HathitrustCatalogDownload.get_doc_disp}",
78 'desc' => "{HathitrustCatalogDownload.get_doc}",
79 'type' => "flag",
80 'reqd' => "no"},
81 { 'name' => "get_doc_exts",
82 'disp' => "{HathitrustCatalogDownload.get_doc_exts_disp}",
83 'desc' => "{HathitrustCatalogDownload.get_doc_exts}",
84 'type' => "string",
85 'deft' => "doc,pdf,ppt",
86 'reqd' => "no"},
87 { 'name' => "max_records",
88 'disp' => "{HathitrustCatalogDownload.max_records_disp}",
89 'desc' => "{HathitrustCatalogDownload.max_records}",
90 'type' => "int",
91 'range' => "1,",
92 'reqd' => "no"} ];
93
94my $options = { 'name' => "HathitrustCatalogDownload",
95 'desc' => "{HathitrustCatalogDownload.desc}",
96 'abstract' => "no",
97 'inherits' => "yes",
98 'args' => $arguments };
99
100sub new
101{
102 my ($class) = shift (@_);
103 my ($getlist,$inputargs,$hashArgOptLists) = @_;
104 push(@$getlist, $class);
105
106 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107 push(@{$hashArgOptLists->{"OptList"}},$options);
108
109 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
110
111 if ($self->{'info_only'}) {
112 # don't worry about any options etc
113 return bless $self, $class;
114 }
115
116 my $parser = new XML::Parser('Style' => 'Stream',
117 'PluginObj' => $self,
118 'Handlers' => {'Char' => \&Char,
119 'Start' => \&OAI_StartTag,
120 'End' => \&OAI_EndTag
121 });
122 $self->{'parser'} = $parser;
123
124 # make sure the tmp directory that we will use later exists
125 my $tmp_dir = "$ENV{GSDLHOME}/tmp";
126 if (! -e $tmp_dir) {
127 &util::mk_dir($tmp_dir);
128 }
129
130 # if max_records not specified, parsing will have set it to ""
131 undef $self->{'max_records'} if $self->{'max_records'} eq "";
132
133 # set up hashmap for individual items in get_doc_exts
134 # to make testing for matches easier
135
136 $self->{'lookup_exts'} = {};
137 my $get_doc_exts = $self->{'get_doc_exts'};
138
139 if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
140 my @exts = split(/,\s*/,$get_doc_exts);
141 foreach my $e (@exts) {
142 $self->{'lookup_exts'}->{lc($e)} = 1;
143 }
144 }
145
146
147 return bless $self, $class;
148}
149
150sub download
151{
152 my ($self) = shift (@_);
153 my ($hashGeneralOptions) = @_;
154
155## my $cmdWget = $strWgetOptions;
156
157 my $strOutputDir ="";
158 $strOutputDir = $hashGeneralOptions->{"cache_dir"};
159 my $strBasURL = $self->{'url'};
160 my $blnDownloadDoc = $self->{'get_doc'};
161
162 print STDERR "<<Defined Maximum>>\n";
163
164 my $strIDs = $self->getOAIIDs($strBasURL);
165
166 if($strIDs eq "")
167 {
168 print STDERR "Error: No IDs found\n";
169 return 0;
170 }
171
172 my $aryIDs = $self->parseOAIIDs($strIDs);
173 my $intIDs = 0;
174 if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
175 {
176 $intIDs = $self->{'max_records'};
177 }
178 else
179 {
180 $intIDs = scalar(@$aryIDs);
181 }
182 print STDERR "<<Total number of record(s):$intIDs>>\n";
183
184 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
185
186# my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
187# &util::rm($tmp_file);
188
189 return 1;
190}
191
192sub getOAIIDs
193{
194 my ($self,$strBasURL) = @_;
195## my ($cmdWget);
196
197 my $wgetOptions = $self->getWgetOptions();
198
199 my $cmdWget = $wgetOptions;
200
201 print STDERR "Gathering OAI identifiers.....\n";
202
203 my $metadata_prefix = $self->{'metadata_prefix'};
204 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
205
206 # if $set specified, add it in to URL
207 my $set = $self->{'set'};
208 $cmdWget .= "&set=$set" if ($set ne "");
209
210 $cmdWget .= "\" ";
211
212 my $accumulated_strIDs = "";
213 my $strIDs = $self->useWget($cmdWget);
214
215 if (!defined $strIDs or $strIDs eq "" ){
216 print STDERR "Server information is unavailable.\n";
217 print STDERR "<<Finished>>\n";
218 return;
219 }
220 if ($self->{'forced_quit'}) {
221 return $strIDs;
222 }
223
224 print STDERR "<<Download Information>>\n";
225
226 $self->parse_xml($strIDs);
227
228 $accumulated_strIDs = $strIDs;
229 my $max_recs = $self->{'max_records'};
230 while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
231 # top up list with further requests for IDs
232
233 my $resumption_token = $1;
234
235 $cmdWget = $wgetOptions;
236
237 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
238
239 $strIDs = $self->useWget($cmdWget);
240 if ($self->{'forced_quit'}) {
241 return $accumulated_strIDs;
242 }
243
244 $self->parse_xml($strIDs);
245
246 $accumulated_strIDs .= $strIDs;
247
248 my @accumulated_identifiers
249 = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
250
251 my $num_acc_identifiers = scalar(@accumulated_identifiers);
252 if (defined $max_recs && $num_acc_identifiers > $max_recs ) {
253 last;
254 }
255 }
256
257 return $accumulated_strIDs;
258}
259
260sub parseOAIIDs
261{
262 my ($self,$strIDs) = @_;
263
264 print STDERR "Parsing OAI identifiers.....\n";
265 $strIDs =~ s/^.*?<identifier>/<identifier>/s;
266 $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
267
268 my @aryIDs = ();
269
270 while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
271 {
272 $strIDs = $2;
273 push(@aryIDs,$1);
274 }
275
276 return \@aryIDs;
277}
278
279sub dirFileSplit
280{
281 my ($self,$strFile) = @_;
282
283 my @aryDirs = split("[/\]",$strFile);
284
285 my $strLocalFile = pop(@aryDirs);
286 my $strSubDirs = join("/",@aryDirs);
287
288 return ($strSubDirs,$strLocalFile);
289}
290
291sub getOAIDoc
292{
293 my ($self,$strRecord, $oai_rec_filename) = @_;
294
295 print STDERR "Gathering source documents.....\n";
296 # look out for identifier tag in metadata section
297
298 if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
299 {
300 my $strMetaTag = $1;
301 my $had_valid_url = 0;
302 my $count = 1;
303 while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
304 {
305 my $doc_id_url = $2;
306 print STDERR "Found doc url: $doc_id_url\n";
307 next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
308
309 my $orig_doc_id_url = $doc_id_url;
310 $had_valid_url = 1;
311
312 my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
313 my $faked_ext = 0;
314 my $primary_doc_match = 0;
315
316 my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
317
318 if (defined $id_file_ext) {
319 # cross-check this filename extension with get_doc_exts option
320 # if provided
321 my $lookup_exts = $self->{'lookup_exts'};
322
323 if (defined $lookup_exts->{lc($id_file_ext)}) {
324 # this initial URL matches requirement
325 $primary_doc_match = 1;
326 }
327 }
328 else {
329 $faked_ext = 1;
330 $id_file_ext = "html";
331 }
332
333
334 if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
335 # Download this doc if HTML, scan through it looking for a link
336 # that does match get_doc_exts
337
338
339 # 1. Generate a tmp name
340 my $tmp_filename = &util::get_tmp_filename();
341
342 # 2. Download it
343 my $wget_opts2 = $self->getWgetOptions();
344 my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
345 my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2);
346 return $strRecord if $self->{'forced_quit'};
347
348 if($error2 ne "")
349 {
350 print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
351 #exit(-1);
352 next;
353 }
354
355 if (defined $follow2) {
356 # src url was "redirected" to another place
357 # => pick up on this and make it the new doc_id_url
358 $doc_id_url = $follow2;
359 }
360
361 my $primary_doc_html = "";
362 if (open(HIN,"<$tmp_filename")) {
363 my $line;
364 while (defined ($line = <HIN>)) {
365 $primary_doc_html .= $line;
366 }
367 close(HIN);
368
369 # 3. Scan through it looking for match
370 #
371 # if got match, change $doc_id_url to this new URL and
372 # $id_file_ext to 'match'
373
374 my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
375
376 my $lookup_exts = $self->{'lookup_exts'};
377
378 foreach my $href (@href_links) {
379 my ($ext) = ($href =~ m/\.([^\.]+)$/);
380
381 if ((defined $ext) && (defined $lookup_exts->{$ext})) {
382
383 if ($href !~ m/^(https?|ftp):\/\//) {
384 # link is within current site
385 my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
386
387 $href = "$site_domain$href";
388 }
389
390 $doc_id_url = $href;
391 $id_file_ext = $ext;
392 last;
393 }
394 }
395 }
396 else {
397 print STDERR "Error occurred while retrieving OAI source documents (2):\n";
398 print STDERR "$!\n";
399 }
400
401 if (-e $tmp_filename) {
402 &util::rm($tmp_filename);
403 }
404 }
405
406 my $download_doc_filename = $oai_rec_filename;
407 my $new_extension = "\-$count\.$id_file_ext";
408 $count++;
409 #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
410 $download_doc_filename =~ s/\.oai$/$new_extension/;
411 my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
412
413 # may have &apos; in url - others??
414 my $safe_doc_id_url = $doc_id_url;
415 $safe_doc_id_url =~ s/&apos;/\'/g;
416
417 my $wget_opts = $self->getWgetOptions();
418 my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
419
420 my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd);
421 return $strRecord if $self->{'forced_quit'};
422
423 if($errors ne "")
424 {
425 print STDERR "Error occured while retriving OAI souce documents (3):\n";
426 print STDERR "$errors\n";
427 #exit(-1);
428 next;
429 }
430
431
432 $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
433}
434
435if (!$had_valid_url)
436{
437 print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
438}
439}
440 else
441{
442 print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
443}
444
445return $strRecord;
446}
447
448sub getOAIRecords
449{
450 my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
451 my $intDocCounter = 0;
452
453 my $metadata_prefix = $self->{'metadata_prefix'};
454
455 foreach my $strID ( @$aryIDs)
456 {
457 print STDERR "Gathering OAI record with ID $strID.....\n";
458
459 my $wget_opts = $self->getWgetOptions();
460 my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
461
462 my $strRecord = $self->useWget($cmdWget);
463
464 my @fileDirs = split(":",$strID);
465 my $local_id = pop @fileDirs;
466
467 # setup directories
468
469 $strOutputDir =~ s/"//g; #"
470
471 my $host =$self->{'url'};
472
473 $host =~ s@https?:\/\/@@g;
474
475 $host =~ s/:.*//g;
476
477 my $strFileURL = "";
478 if ($strOutputDir ne "") {
479 $strFileURL = "$strOutputDir/";
480 }
481 $strFileURL .= "$host/$local_id.oai";
482
483 # prepare subdirectory for record (if needed)
484 my ($strSubDirPath,$unused) = ("", "");
485
486 ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
487
488 &util::mk_all_dir($strSubDirPath);
489
490 my $ds = &util::get_dirsep();
491
492 if($blnDownloadDoc)
493 {
494 $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
495 }
496
497 # save record
498 open (OAIOUT,">$strFileURL")
499 || die "Unable to save oai metadata record: $!\n";
500 print OAIOUT $strRecord;
501 close(OAIOUT);
502
503 print STDERR "Saving record to $strFileURL\n";
504 print STDERR "<<Done>>\n";
505 $intDocCounter ++;
506 last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
507 }
508
509 (defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
510 print STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
511 print STDERR "Complete download meta record from $strBasURL\n";
512
513 print STDERR "<<Finished>>\n";
514}
515
516sub url_information
517{
518 my ($self) = shift (@_);
519 if(!defined $self){ die "System Error: No \$self defined for url_information in HathitrustCatalogDownload\n";}
520
521 my $wgetOptions = $self->getWgetOptions();
522 my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
523
524 my $strIdentify = "verb=Identify";
525 my $strListSets = "verb=ListSets";
526 my $strListMdFormats = "verb=ListMetadataFormats";
527
528 my $strIdentifyCMD = $strBaseCMD;
529 $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;
530
531 my $strIdentifyText = $self->useWget($strIdentifyCMD);
532
533 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
534 print STDERR "Server information is unavailable.\n";
535 print STDERR "<<Finished>>\n";
536 return;
537 }
538
539 print STDERR "General information:\n";
540 $self->parse_xml($strIdentifyText);
541 print STDERR "\n";
542
543 print STDERR "=" x 10, "\n";
544 print STDERR "Metadata Format Information (metadataPrefix):\n";
545 print STDERR "=" x 10, "\n";
546
547 my $strListMdFormatsCMD = $strBaseCMD;
548 $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;
549 my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
550
551 $self->parse_xml($strListMdFormatsText);
552 print STDERR "\n";
553
554 print STDERR "=" x 10, "\n";
555 print STDERR "List Information:\n";
556 print STDERR "=" x 10, "\n";
557
558 my $strListSetCMD = $strBaseCMD;
559 $strListSetCMD =~ s/_OPTS_/$strListSets/;
560 my $strListSetsText = $self->useWget($strListSetCMD);
561
562 $self->parse_xml($strListSetsText);
563}
564
565sub parse_xml
566{
567 my ($self) = shift (@_);
568 my ($xml_text) = @_;
569
570 #### change this to work directly from $xml_text
571
572 #Open a temporary file to store OAI information, and store the information to the temp file
573 my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
574
575 open(*OAIOUT,"> $name");
576
577 print OAIOUT $xml_text;
578 close(OAIOUT);
579
580 $self->{'temp_file_name'} = $name;
581
582## print STDERR "**** xml text = $xml_text\n";
583
584 eval {
585 $self->{'parser'}->parsefile("$name");
586##$self->{'parser'}->parse($xml_text);
587 };
588
589 if ($@) {
590 die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
591##die "OAI: Parsed text is not a well formed XML file ($@)\n";
592 }
593
594 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
595}
596
597####END
598#{
599# if($self->{'info'})
600# {
601#unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
602# }
603#}
604
605# This Char function overrides the one in XML::Parser::Stream to overcome a
606# problem where $expat->{Text} is treated as the return value, slowing
607# things down significantly in some cases.
608sub Char {
609 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
610 $_[0]->{'Text'} .= $_[1];
611
612 my $self = $_[0]->{'PluginObj'};
613 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
614 $self->{'text'} .= $_[1];
615 $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
616 if($self->{'text'} ne "")
617 {
618 print STDERR " $self->{'subfield'}:($self->{'text'})\n";
619 }
620 }
621 return undef;
622}
623
624sub OAI_StartTag
625{
626 my ($expat, $element, %attr) = @_;
627
628 my $self = $expat->{'PluginObj'};
629 $self->{'subfield'} = $element;
630
631}
632
633sub OAI_EndTag
634{
635 my ($expat, $element) = @_;
636
637 my $self = $expat->{'PluginObj'};
638 $self->{'text'} = "";
639 $self->{'subfield'} = "";
640}
641
642sub error
643{
644 my ($self,$strFunctionName,$strError) = @_;
645 {
646 print "Error occoured in HathitrustCatalogDownload.pm\n".
647 "In Function:".$strFunctionName."\n".
648 "Error Message:".$strError."\n";
649 exit(-1);
650 }
651}
652
653#=======
654
655sub data_api
656{
657 my ($doc_id) = @_;
658
659 #my $access_key = 'PUBLIC_OAUTH_CONSUMER_KEY';
660 #my $secret_key = 'PUBLIC_OAUTH_CONSUMER_SECRET';
661
662 my $access_key = '7e6ee38bae'; # PUBLIC_OAUTH_CONSUMER_KEY
663 my $secret_key = 'e0429c0394385486249b4a230702'; # PUBLIC_OAUTH_CONSUMER_SECRET
664
665 #my $request_url = 'http://babel.hathitrust.org/cgi/htd/dapiserver';
666 #my $request_url = "http://babel.hathitrust.org/cgi/htd/meta/mdp.39015019203879";
667 my $request_url = "http://babel.hathitrust.org/cgi/htd/pagemeta/mdp.39015000000128/12";
668
669
670 my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
671 'consumer_secret' => $secret_key,
672 'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );
673
674 my $response = $consumer->request( 'method' => 'GET',
675 'url' => $request_url,
676 # 'params' => { 'hello' => 'world' }
677 );
678
679# print CGI::header();
680
681# print "<p><b>[CLIENT] sent this URL to server:</b><br/>";
682# print $consumer->oauth_request->uri;
683
684# print "<p><b>[CLIENT] received this HTTP response from server:</b><br/>";
685# print $response->status_line;
686
687 if ($response->is_success) {
688# print "<br/><b>[CLIENT] received this content response from server:</b><blockquote>" .
689# $response->content . "</blockquote>";
690
691 print "Recieved content:\n";
692 print "------\n";
693
694 print $response->content()
695 }
696 else {
697 print STDERR "**** Failed to retrieval any content from URL:\n";
698 print STDERR " ", $consumer->oauth_request->uri, "\n";
699 print STDERR "**** Status: ", print $response->status_line, "\n";
700
701 }
702
703
704## print STDERR "*****\n ", $consumer->oauth_request->uri, "\n";
705}
706
707
708sub bibliographic_api
709{
710 my ($catalog_id) = @_;
711
712 my $catalog_json = "$catalog_id.json";
713 my $base_url = "http://catalog.hathitrust.org/api/volumes/full/recordnumber";
714 my $url = "$base_url/$catalog_json";
715
716 my $ua = LWP::UserAgent->new();
717# $ua->agent("Greenstone DL Ingest");
718
719 # make request
720 my $request = HTTP::Request->new(GET => $url);
721
722 # get response
723 my $response = $ua->request($request);
724
725 if ($response->is_success()) {
726
727 my $content_type = $response->content_type();
728
729 my $content = $response->content();
730
731 my $group_by_dir = "output";
732
733 if (!-d $group_by_dir) {
734 print "Creating '$group_by_dir'\n";
735 mkdir($group_by_dir);
736 }
737
738 my @group_by = ($catalog_id =~ m/\d{1,2}/g);
739
740 while (my $next_subdir = shift @group_by) {
741 $group_by_dir .= "/$next_subdir";
742 if (!-d $group_by_dir) {
743 mkdir($group_by_dir);
744 }
745
746 last if (scalar(@group_by)==1);
747 }
748
749 my $ofilename = "$group_by_dir/$catalog_json";
750 if (!-e $ofilename) {
751 if (open(JOUT,">$ofilename")) {
752
753 print JOUT $content;
754 print JOUT "\n";
755 close(JOUT);
756 }
757 else {
758 print STDERR "Error: Failed to open $ofilename\n";
759 print STDERR "!$\n";
760 }
761 }
762 else {
763 print STDOUT "$ofilename already exists. Skipping.\n";
764 }
765 }
766 else
767 {
768 print STDERR "Error: Failed to retrieve $url\n";
769 print STDERR "-----\n";
770 print STDERR "Status line: ", $response->status_line(), "\n";
771 print STDERR " ", $response->content(),"\n";
772 }
773}
774
775
776
777sub main
778{
779
780 my ($argv_ref) = @_;
781
782 my $query=join("+",@$argv_ref) || "zealand";
783
784 my $base_url = "http://catalog.hathitrust.org/Search/Home?checkspelling=true&type=all&submit=&type=all&sethtftonly=true";
785 my $url = $base_url . "&lookfor=" . $query;
786
787 my $mech = WWW::Mechanize->new();
788 $mech->get($url);
789
790 my $next_link = $mech->find_link( text_regex => qr/^Next\s+/);
791
792 my $count=0;
793
794 while (defined($next_link)) {
795
796
797 my $catalog_links = $mech->find_all_links(text_regex => qr/^Catalog Record\s*/);
798# my $full_links = $mech->find_all_links(text_regex => qr/^Full view\s*$/,
799# url_regex => qr/hdl\.handle\.net/);
800# my $restricted_links = $mech->find_all_links(text_regex => qr/^Limited \(search-only\)/,
801# url_regex => qr/hdl\.handle\.net/);
802
803 my $num_catalog_links = scalar(@$catalog_links);
804# my $num_full_links = scalar(@$full_links);
805# my $num_restricted_links = scalar(@$restricted_links);
806
807# print "+++++ num cat links $num_catalog_links: num full = $num_full_links, num restricted = $num_restricted_links\n";
808
809
810 foreach my $cat_link (@$catalog_links) {
811 my $cat_url = $cat_link->url();
812 my ($cat_id) = ($cat_url =~ m/\/([^\/]*)$/);
813 print "cat id = $cat_id\n";
814 bibliographic_api($cat_id);
815 }
816
817
818 $url = $next_link->url();
819
820 $mech->get($url);
821 $next_link = $mech->find_link( text_regex => qr/^Next\s+/);
822
823 $count++;
824
825## last if ($count==1);
826
827 print "Away to Process link: $url\n";
828
829 }
830
831}
832
833
834
8351;
Note: See TracBrowser for help on using the repository browser.