#!/usr/bin/perl -w

# Walks a directory tree of catalog JSON records, and for the first
# public-domain item in each record downloads (via the HathiTrust Data API)
# the METS structure record, the page images and the page OCR text, then
# writes an XML item file describing the page structure.
#
# Usage: <this script> [toplevel_dir]      (toplevel_dir defaults to "output")

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
use warnings;

use Encode;
use JSON;

# use LWP;
use OAuth::Lite::Consumer;
use OAuth::Lite::AuthMethod;

use URI::Escape;

# Retry policy shared by the page-level downloads: a failed request is
# attempted again after a pause until $MAX_ATTEMPTS requests have failed.
my $MAX_ATTEMPTS     = 2;
my $RETRY_DELAY_SECS = 60;


# Normalise a decoded JSON entry that may be a single value or a list of
# values into an array reference (undef becomes an empty list).
sub _as_array_ref
{
    my ($entry) = @_;

    return [] if (!defined $entry);
    return (ref($entry) eq "ARRAY") ? $entry : [ $entry ];
}


# Escape the characters that are not allowed verbatim inside a
# double-quoted XML attribute value.
sub _xml_attr_escape
{
    my ($value) = @_;

    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;

    return $value;
}


# Read a whole file through the :utf8 layer.
# Returns the content ("" for an empty file), or undef with $! set when the
# file cannot be opened; the caller reports the error.
sub _read_utf8_file
{
    my ($filename) = @_;

    open(my $fh, '<', $filename) or return;
    binmode($fh, ":utf8");

    local $/ = undef; # slurp mode
    my $content = <$fh>;
    close($fh);

    return (defined $content) ? $content : "";
}


# Write $content to $filename, reporting any failure on STDERR.
#   $layer   - I/O layer passed to binmode, or undef to leave the default
#   $purpose - text used in the "Failed to open ... for <purpose>" message
# Returns 1 on success, 0 on failure.
sub _write_file
{
    my ($filename,$content,$layer,$purpose) = @_;

    my $fh;
    if (!open($fh, '>', $filename)) {
	print STDERR "Error: Failed to open $filename for $purpose\n";
	print STDERR " $!\n";
	return 0;
    }

    binmode($fh, $layer) if (defined $layer);

    # Buffered write errors may only surface at close, so check both
    my $ok = print {$fh} $content;
    $ok = close($fh) && $ok;

    if (!$ok) {
	print STDERR "Error: Failed to write $filename\n";
	print STDERR " $!\n";
	# A truncated file would otherwise be mistaken for a cached download
	unlink($filename);
	return 0;
    }

    return 1;
}


# Issue one signed GET request against the HathiTrust Data API.
#   $mode       - API resource, e.g. "pageimage", "pageocr", "structure"
#   $htid       - HathiTrust item id
#   $opt_seq    - optional page sequence number appended to the URL
#   $opt_params - optional hash ref of extra query parameters
# Returns the response object on success; on failure prints diagnostics to
# STDERR and returns undef.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    # FIXME: credentials should not live in source control. They can now be
    # supplied through the environment; the embedded values remain only as a
    # backward-compatible fallback.
    my $access_key = (defined $ENV{'HT_ACCESS_KEY'})
	? $ENV{'HT_ACCESS_KEY'} : '7e6ee38bae';
    my $secret_key = (defined $ENV{'HT_SECRET_KEY'})
	? $ENV{'HT_SECRET_KEY'} : 'e0429c0394385486249b4a230702';

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new(
	'consumer_key'    => $access_key,
	'consumer_secret' => $secret_key,
	'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request(
	'method' => 'GET',
	'url'    => $request_url,
	'params' => $opt_params );

    if (!$response->is_success()) {
	# All diagnostics go to STDERR (the separators used to go to STDOUT,
	# which interleaved unpredictably with the rest of the report)
	print STDERR "**** Failed to retrieval any content from URL:\n";
	print STDERR " ", $consumer->oauth_request->uri, "\n";
	print STDERR "------\n";
	print STDERR "**** Status: ", $response->status_line, "\n";
	print STDERR "------\n";

	# Strip markup and blank lines so an HTML error page stays readable
	my $text_only_content = $response->content();
	$text_only_content =~ s/<[^>]*>//g;
	$text_only_content =~ s/^\s*$//mg;

	print STDERR "**** Content: $text_only_content\n";
	print STDERR "------\n";

	$response = undef;
    }

    return $response;
}


# Call _data_api() for a page-level resource, retrying after a pause.
#   $label - human readable resource name used in the log messages
# Returns the successful response. Exits the program (status -1) once
# $MAX_ATTEMPTS requests have failed.
sub _data_api_with_retry
{
    my ($mode,$label,$htid,$seq_num) = @_;

    my $failed_attempts = 0;

    while (1) {
	my $response = _data_api($mode,$htid,$seq_num);
	return $response if (defined $response);

	$failed_attempts++;
	print STDERR "Failed to download $label\n";

	if ($failed_attempts >= $MAX_ATTEMPTS) {
	    print STDERR "Maximum number of attempts reached. Stopping.\n";
	    exit -1;
	}

	print STDERR "Sleeping to $RETRY_DELAY_SECS seconds\n";
	sleep($RETRY_DELAY_SECS);
	print STDERR "Retry attempt $failed_attempts\n";
    }
}


# Download the image for page $seq_num of $htid into $ofilename, unless that
# file already exists.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
	print STDERR "Skipping PageImage data API request\n";
	print STDERR "=> downloaded file $ofilename already exists\n";
	return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    my $response = _data_api_with_retry("pageimage","PageImage",$htid,$seq_num);
    _write_file($ofilename,$response->content(),":raw","binary output");

    return;
}


# Fetch the OCR text for page $seq_num of $htid.
#   $ofilename - optional cache file; when it exists its content is used
#                instead of a download, otherwise the download is saved to it
# Returns the text, or undef when the cached file cannot be read.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if ((defined $ofilename) && (-f $ofilename)) {
	print STDERR "Skipping PageOCR Data API request\n";
	print STDERR "=> Using cached version of file:\n $ofilename\n";

	my $cached_content = _read_utf8_file($ofilename);
	if (!defined $cached_content) {
	    print STDERR "Error: Failed to open cached file $ofilename for input\n";
	    print STDERR " $!\n";
	}
	return $cached_content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    my $response = _data_api_with_retry("pageocr","PageOCR",$htid,$seq_num);
    my $content = $response->content();

    # The cache file is optional; previously an undefined name was still
    # handed to open()
    if (defined $ofilename) {
	_write_file($ofilename,$content,undef,"binary output");
    }

    return $content;
}


# Fetch the METS structure record of $htid as decoded JSON, using $ofilename
# as a cache (read when present, written after a download).
# Returns the decoded data, or undef when the record cannot be obtained or
# parsed (the reason is reported on STDERR).
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
	print STDERR "Downloading METS structure record for $htid\n";

	my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
	if (!defined $response) {
	    # _data_api() has already printed the details
	    print STDERR "Error: Failed to download METS structure record for $htid\n";
	    return;
	}

	$json_content = $response->content();
	_write_file($ofilename,$json_content,":utf8","output");
    }
    else {
	print STDERR "Skipping Structure Data API request\n";
	print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";

	my $cached_content = _read_utf8_file($ofilename);
	if (!defined $cached_content) {
	    print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
	    print STDERR " $!\n";
	    return;
	}
	$json_content = $cached_content;
    }

    ## print "**** $json_content\n";

    # NOTE(review): the downloaded content is presumably a raw byte string, in
    # which case writing it through :utf8 and re-encoding it here would
    # double-encode non-ASCII text. Behaviour is left as it was - confirm
    # before changing, since existing cache files were written this way.
    my $json_content_utf8 = Encode::encode("utf8",$json_content);

    my $json_data;
    my $parsed_ok = eval { $json_data = decode_json($json_content_utf8); 1 };
    if (!$parsed_ok) {
	print STDERR "Error: Failed to parse JSON structure record for $htid\n";
	print STDERR " $@\n";
	return;
    }

    return $json_data;
}


# Example file
#
# NOTE(review): the markup of this example (and of the XML literals printed
# below) is missing from the copy of the file under review; it has been
# reconstructed on the assumption that the target is Greenstone's paged-image
# item format - confirm against the plugin documentation.
#
# <PagedDocument>
#   <Metadata name="Title">Matariki 1881</Metadata>
#   <Metadata name="Date">18810423</Metadata>
#   <Metadata name="Volume">1</Metadata>
#   <PageGroup>
#     <Metadata name="Title">Supplementary Material</Metadata>
#     <Page pagenum="1" imgfile="..." txtfile="...">
#       <Metadata name="Title">Abstract</Metadata>
#     </Page>
#   </PageGroup>
#   <PageGroup>
#     <Metadata name="Title">Newspaper pages</Metadata>
#     <Page pagenum="1" imgfile="..." txtfile="..."/>
#     <Page pagenum="2" imgfile="..." txtfile="..."/>
#   </PageGroup>
# </PagedDocument>

# Recursively write the structure below one METS div to the PIOUT filehandle
# (opened by generate_paged_image_structure), downloading the image and OCR
# file of every leaf page on the way.
#   $this_div            - decoded METS:div hash
#   $pagenum             - page number recorded for this div
#   $depth               - nesting depth, used for indentation
#   $htid                - HathiTrust item id
#   $file_id_map         - hash ref: METS file ID => METS:file hash
#   $resource_output_dir - directory the page files are downloaded into
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    # File references in the item file are relative to the item file itself,
    # so only the last path component of the resource directory is used
    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);
    $local_output_dir = $resource_output_dir if (!defined $local_output_dir);

    my $fptr_entry = $this_div->{'METS:fptr'};
    my $div_entry  = $this_div->{'METS:div'};

    if (defined $div_entry) {
	# Only want Greenstones <PageGroup> tag if not a METS leaf div
	print PIOUT " " x $depth, "<PageGroup>\n";
    }

    if (defined $fptr_entry) {
	# hit a leaf node

	my $imgfile = undef;
	my $txtfile = undef;

	foreach my $fptr_hash (@{ _as_array_ref($fptr_entry) }) {
	    my $fileid = $fptr_hash->{'FILEID'};
	    ## print STDERR "Looking up fileid = $fileid\n";

	    my $file = (defined $fileid) ? $file_id_map->{$fileid} : undef;
	    if (!defined $file) {
		my $shown_id = (defined $fileid) ? $fileid : "(none)";
		print STDERR "Warning: No file section entry for FILEID $shown_id\n";
		next;
	    }

	    my $seq  = $file->{'SEQ'};
	    my $href = $file->{'METS:FLocat'}->{'xlink:href'};
	    my $use  = $file->{'USE'};
	    next if ((!defined $href) || (!defined $use));

	    if ($use =~ m/\bimage\b/i) {
		$imgfile = "$local_output_dir/$href";
		pageimage_data_api($htid,$seq,"$resource_output_dir/$href");
	    }
	    elsif ($use =~ m/\bocr\b/i) {
		$txtfile = "$local_output_dir/$href";
		pageocr_data_api($htid,$seq,"$resource_output_dir/$href");
	    }
	}

	# Generate line along the following lines
	#   <Page pagenum="1" imgfile="dir/00000001.jp2" txtfile="dir/00000001.txt"/>
	# leaving out any attribute for which there is no value
	my $page_tag = "<Page";
	foreach my $attr (['pagenum',$pagenum],['imgfile',$imgfile],['txtfile',$txtfile]) {
	    my ($attr_name,$attr_value) = @$attr;
	    next if (!defined $attr_value);
	    $page_tag .= " $attr_name=\"" . _xml_attr_escape($attr_value) . "\"";
	}
	$page_tag .= "/>";

	print PIOUT " " x ($depth+1), $page_tag, "\n";
    }

    # Now process any child divs

    if (defined $div_entry) {
	my $div_array = _as_array_ref($div_entry);

	print STDERR "+ Processing ", scalar(@$div_array), " sections\n";

	foreach my $div_hash (@$div_array) {
	    my $child_pagenum = $div_hash->{'ORDER'};
	    rec_paged_image_structure($div_hash,$child_pagenum,$depth+1,$htid,
				      $file_id_map,$resource_output_dir);
	}

	# Only want Greenstones </PageGroup> tag if not a METS leaf div
	print PIOUT " " x $depth, "</PageGroup>\n";
    }
}


# Write the XML item file $ofilename for the structure rooted at
# $toplevel_div. The page files are downloaded into a directory named after
# $ofilename with its extension removed (created when missing).
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);
    if (!defined $resource_output_dir) {
	print STDERR "Error: Cannot derive a resource directory from $ofilename\n";
	return;
    }

    if ((!-d $resource_output_dir) && (!mkdir($resource_output_dir))) {
	print STDERR "Error: Failed to create directory $resource_output_dir\n";
	print STDERR " $!\n";
	return;
    }

    # PIOUT stays a package filehandle: rec_paged_image_structure() prints to it
    if (open(PIOUT, '>', $ofilename)) {
	binmode(PIOUT,":utf8");

	print PIOUT "<PagedDocument>\n";
	rec_paged_image_structure($toplevel_div,1,1,$htid,$file_id_map,$resource_output_dir);
	print PIOUT "</PagedDocument>\n";

	if (!close(PIOUT)) {
	    print STDERR "Error: Failed to write $ofilename\n";
	    print STDERR " $!\n";
	}
    }
    else {
	print STDERR "Error: Failed to open $ofilename for output\n";
	print STDERR " $!\n";
    }
}


# Number of documents for which an item file has been generated
my $pdCount = 0;

# Download everything for one HathiTrust item.
#   $cat_key   - catalog record key (currently unused)
#   $htid      - HathiTrust item id
#   $ofilename - cache file for the JSON structure record; the item file name
#                is derived from it (_structure.json => _item.xml)
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);
    if (ref($json_data) ne "HASH") {
	print STDERR "Error: No usable structure record for $htid, skipping\n";
	return;
    }

    # Map in the IDs from:
    #   METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_entry
	= $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # A single file group is upgraded to a list, like every other entry
    foreach my $file_grp (@{ _as_array_ref($file_grp_entry) }) {
	next if (ref($file_grp) ne "HASH");

	my $use = $file_grp->{'USE'};

	foreach my $file_hash (@{ _as_array_ref($file_grp->{'METS:file'}) }) {
	    next if (ref($file_hash) ne "HASH");

	    # push file_grp USE attribute down into each file entry
	    # (to make file easier later on)
	    $file_hash->{'USE'} = $use;

	    my $file_id = $file_hash->{'ID'};
	    next if (!defined $file_id);

	    $file_sec_ids->{$file_id} = $file_hash;
	}
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = _as_array_ref($json_data->{'METS:mets'}->{'METS:structMap'})->[0];
    my $toplevel_div = (ref($struct_map) eq "HASH") ? $struct_map->{'METS:div'} : undef;
    if (ref($toplevel_div) ne "HASH") {
	print STDERR "Error: No top-level METS:div found for $htid, skipping\n";
	return;
    }

    my $pi_filename = $ofilename;
    if (!($pi_filename =~ s/_structure\.json$/_item.xml/)) {
	# Never let the item file overwrite the cached structure record
	$pi_filename .= "_item.xml";
    }

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    $pdCount++;
}


# Process one catalog JSON file: find its first public-domain item and
# download that document.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    my $json_file_content = _read_utf8_file($filename);
    if (!defined $json_file_content) {
	print STDERR "Error: Failed to open JSON file $filename for input\n";
	print STDERR " $!\n";
	return;
    }

    my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);

    my $json_data;
    my $parsed_ok = eval { $json_data = decode_json($json_file_content_utf8); 1 };
    if ((!$parsed_ok) || (ref($json_data) ne "HASH")) {
	my $reason = ($parsed_ok) ? "top-level value is not an object" : $@;
	print STDERR "Error: Failed to parse JSON file $filename\n";
	print STDERR " $reason\n";
	return;
    }

    # Sorted so that the chosen key does not depend on hash ordering
    my $record_hash = $json_data->{'records'};
    my @record_keys = (ref($record_hash) eq "HASH") ? sort keys %$record_hash : ();
    my $primary_cat_key = shift @record_keys;

    my $items_entry = $json_data->{'items'};

    print STDERR "*** ref: ", ref($items_entry), "\n\n";

    foreach my $item (@{ _as_array_ref($items_entry) }) {
	next if (ref($item) ne "HASH");

	my $htid = $item->{'htid'};
	my $rights_code = $item->{'rightsCode'};

	if (defined($htid) && defined($rights_code) && ($rights_code eq "pd")) {
	    # in the public domain

	    # Anchored, so that only the file extension is rewritten
	    my $ofilename = $filename;
	    $ofilename =~ s/\.json$/_structure.json/;

	    download_ht_doc($primary_cat_key,$htid,$ofilename);

	    # bail out at first public domain version of document
	    last;
	}
    }
}


# Recursively process every catalog *.json file below $full_dir.
sub process_dir
{
    my ($full_dir) = @_;

    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
	print STDERR "Error: Failed to open directory: $full_dir\n";
	print STDERR " $!\n";
	return;
    }

    my @dir_content = grep { $_ !~ m/^\./ } sort readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
	my $full_df = "$full_dir/$df";

	if (-d $full_df) {
	    process_dir($full_df);
	}
	elsif ($full_df =~ m/_structure\.json$/) {
	    # Structure-record cache written by an earlier run of this
	    # script, not a catalog record
	    next;
	}
	elsif ($full_df =~ m/\.json$/) {
	    read_json_file($full_df);
	}
    }
}


# Entry point: $argv_ref is a reference to the command line arguments; the
# first one names the directory to process (default "output").
sub main
{
    my ($argv_ref) = @_;

    my $toplevel_dir = shift @$argv_ref || "output";
    $toplevel_dir =~ s/\/$//; # remove any trailing /

    process_dir($toplevel_dir);
}

main(\@ARGV);