Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 11 years ago
Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone
File size: 11.5 KB

Rev	Line
[26436]	1	#!/usr/bin/perl -w
	2
	3	use strict;
	4	no strict 'refs'; # allow filehandles to be variables and viceversa
	5
	6	use warnings;
	7
	8	use Encode;
	9	use JSON;
	10
	11	# use LWP;
	12
	13	use OAuth::Lite::Consumer;
	14	use OAuth::Lite::AuthMethod;
	15
	16	use URI::Escape;
	17
	18	sub _data_api
	19	{
	20	my ($mode,$htid,$opt_seq,$opt_params) = @_;
	21
	22	my $access_key = '7e6ee38bae';
	23	my $secret_key = 'e0429c0394385486249b4a230702';
	24
	25	my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
	26
	27	$request_url .= "/$opt_seq" if (defined $opt_seq);
	28
	29	my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
	30	'consumer_secret' => $secret_key,
	31	'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );
	32
	33	my $response = $consumer->request( 'method' => 'GET',
	34	'url' => $request_url,
	35	'params' => $opt_params );
	36
	37	if (!$response->is_success()) {
	38	print STDERR "**** Failed to retrieval any content from URL:\n";
	39	print STDERR " ", $consumer->oauth_request->uri, "\n";
	40	print "------\n";
	41	print STDERR "**** Status: ", $response->status_line, "\n";
	42	print "------\n";
	43	print STDERR "**** Content: ", $response->content, "\n";
	44	print "------\n";
	45
	46	$response = undef;
	47	}
	48
	49	return $response;
	50	}
	51
	52
	53	sub pageimage_data_api
	54	{
	55	my ($htid,$seq_num,$ofilename) = @_;
	56
	57	if (!-f $ofilename) {
	58	print STDERR "Downloading PageImage $htid/$seq_num\n";
	59
	60	my $response = _data_api("pageimage",$htid, $seq_num );
	61	my $content = $response->content();
	62
	63	if (open(IMGOUT,">$ofilename")) {
	64	binmode(IMGOUT);
	65	print IMGOUT $content;
	66	close(IMGOUT);
	67	}
	68	else {
	69	print STDERR "Error: Failed to open $ofilename for binary output\n";
	70	print STDERR " $!\n";
	71	}
	72	}
	73	else {
	74	print STDERR "Skipping PageImage data API request\n";
	75	print STDERR "=> downloaded file $ofilename already exists\n";
	76	}
	77	}
	78
	79
	80
	81	sub pageocr_data_api
	82	{
	83	my ($htid,$seq_num,$ofilename) = @_;
	84
	85	my $content = undef;
	86
	87	if (((defined $ofilename) && (!-f $ofilename))
	88	\|\| (!defined $ofilename)) {
	89	print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";
	90
	91	my $response = _data_api("pageocr",$htid, $seq_num );
	92	$content = $response->content();
	93
	94	if (open(TXTOUT,">$ofilename")) {
	95	print TXTOUT $content;
	96	close(TXTOUT);
	97	}
	98	else {
	99	print STDERR "Error: Failed to open $ofilename for binary output\n";
	100	print STDERR " $!\n";
	101	}
	102	}
	103	else {
	104	print STDERR "Skipping PageOCR Data API request\n";
	105	print STDERR "=> Using cached version of file:\n $ofilename\n";
	106
	107	if (open(JSIN,"<$ofilename")) {
	108	binmode(JSIN,":utf8");
	109
	110	my $line;
	111	while (defined ($line=<JSIN>)) {
	112	$content .= $line;
	113	}
	114	close(JSIN);
	115	}
	116	else {
	117	print STDERR "Error: Failed to open cached file $ofilename for input\n";
	118	print STDERR " $!\n";
	119	}
	120	}
	121
	122	return $content;
	123	}
	124
	125	sub json_structure_data_api
	126	{
	127	my ($htid,$ofilename) = @_;
	128
	129	my $json_content = "";
	130
	131	if (!-f $ofilename) {
	132	print STDERR "Downloading METS structure record for $htid\n";
	133
	134	my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
	135	$json_content = $response->content();
	136
	137	if (open(JSOUT,">$ofilename")) {
	138	binmode(JSOUT,":utf8");
	139	print JSOUT $json_content;
	140	close(JSOUT);
	141	}
	142	else {
	143	print STDERR "Error: Failed to open $ofilename for output\n";
	144	print STDERR " $!\n";
	145	}
	146
	147	}
	148	else {
	149	print STDERR "Skipping Structure Data API request\n";
	150	print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";
	151
	152	if (open(JSIN,"<$ofilename")) {
	153	binmode(JSIN,":utf8");
	154
	155	my $line;
	156	while (defined ($line=<JSIN>)) {
	157	$json_content .= $line;
	158	}
	159	close(JSIN);
	160	}
	161	else {
	162	print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
	163	print STDERR " $!\n";
	164	}
	165	}
	166
	167	## print "**** $json_content\n";
	168
	169	my $json_content_utf8 = Encode::encode("utf8",$json_content);
	170	my $json_data = decode_json $json_content_utf8;
	171
	172	return $json_data;
	173
	174
	175	}
	176
	177
	178	# Example file
	179
	180	#<PagedDocument>
	181	# <Metadata name="Title">Matariki 1881</Metadata>
	182	# <Metadata name="Date">18810423</Metadata>
	183	# <Metadata name="Number">1</Metadata>
	184	# <PageGroup>
	185	# <Metadata name="Title">Supplementary Material</Metadata>
	186	# <Page txtfile="abstracts/23__1abstract.txt">
	187	# <Metadata name="Title">Abstract</Metadata>
	188	# </Page>
	189	# </PageGroup>
	190	# <PageGroup>
	191	# <Metadata name="Title">Newspaper pages</Metadata>
	192	# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
	193	# <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
	194	# <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
	195	# </PageGroup>
	196	#</PagedDocument>
	197
	198	sub rec_paged_image_structure
	199	{
	200	my ($this_div,$pagenum,$elem_name,$depth,$htid,$file_id_map,$resource_output_dir) = @_;
	201
	202	my ($local_output_dir) = ($resource_output_dir =~ m/^.\/(.?)$/);
	203
	204	print PIOUT " " x $depth, "<$elem_name>\n";
	205
	206	my $fptr_entry = $this_div->{'METS:fptr'};
	207
	208	if (defined $fptr_entry) {
	209	# hit a leaf node
	210
	211	my $fptr_array = undef;
	212
	213	if (ref $fptr_entry eq "HASH") {
	214	$fptr_array = [ $fptr_entry ];
	215	}
	216	else {
	217	$fptr_array = $fptr_entry;
	218	}
	219
	220	my $imgfile = undef;
	221	my $txtfile = undef;
	222
	223	foreach my $fptr_hash (@$fptr_array) {
	224	my $fileid = $fptr_hash->{'FILEID'};
	225
	226	## print STDERR "Looking up fileid = $fileid\n";
	227
	228	my $file = $file_id_map->{$fileid};
	229	my $seq = $file->{'SEQ'};
	230	my $href = $file->{'METS:FLocat'}->{'xlink:href'};
	231
	232	if ($file->{'USE'} =~ m/\bimage\b/i) {
	233	$imgfile = "$local_output_dir/$href";
	234	my $full_imgfile = "$resource_output_dir/$href";
	235	pageimage_data_api($htid,$seq,$full_imgfile);
	236	}
	237	elsif ($file->{'USE'} =~ m/\bocr\b/i) {
	238	$txtfile = "$local_output_dir/$href";
	239	my $full_txtfile = "$resource_output_dir/$href";
	240	pageocr_data_api($htid,$seq,$full_txtfile);
	241	}
	242	}
	243	# Generate line along the following lines
	244
	245	# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
	246	print PIOUT " " x ($depth+1), "<Page ";
	247	print PIOUT "pagenum=\"$pagenum\" " if defined $pagenum;
	248	print PIOUT "imgfile=\"$imgfile\" " if defined $imgfile;
	249	print PIOUT "txtfile=\"$txtfile\" " if defined $txtfile;
	250	print PIOUT "/>\n";
	251
	252	}
	253
	254	# Now process any child divs
	255
	256	my $div_entry = $this_div->{'METS:div'};
	257
	258	if (defined $div_entry) {
	259
	260	my $div_array = undef;
	261
	262	if (ref $div_entry eq "HASH") {
	263	# upgrade single entry to array
	264	$div_array = [ $div_entry ];
	265	}
	266	else {
	267	$div_array = $div_entry;
	268	}
	269
	270	foreach my $div_hash (@$div_array) {
	271
	272	my $pagenum = $div_hash->{'ORDER'};
	273
	274	rec_paged_image_structure($div_hash,$pagenum,"PageGroup",$depth+1,$htid,$file_id_map,$resource_output_dir);
	275	}
	276	}
	277
	278
	279	print PIOUT " " x $depth, "</$elem_name>\n";
	280
	281	}
	282
	283	sub generate_paged_image_structure
	284	{
	285	my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;
	286
	287	print STDERR "Generating PageImage file: $ofilename\n";
	288
	289	my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);
	290	if (!-d $resource_output_dir) {
	291	mkdir $resource_output_dir;
	292	}
	293
	294	if (open(PIOUT,">$ofilename")) {
	295	binmode(PIOUT,":utf8");
	296
	297	rec_paged_image_structure($toplevel_div,1,"PageDocument",0,$htid,$file_id_map,$resource_output_dir);
	298
	299	close(PIOUT);
	300	}
	301	else {
	302	print STDERR "Error: Failed to open $ofilename for output\n";
	303	print STDERR " $!\n";
	304	}
	305
	306
	307
	308	}
	309
	310
	311	sub download_ht_doc
	312	{
	313	my ($cat_key,$htid,$ofilename) = @_;
	314
	315	my $json_data = json_structure_data_api($htid,$ofilename);
	316
	317	# Map in the IDs from:
	318	# METS:mets->METS:fileSec->METS:fileGrp
	319
	320	my $file_sec_ids = {};
	321
	322	my $file_grp_array = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};
	323
	324	# print "**** num file grps = ", scalar(@$file_grp_array), "\n";
	325
	326	foreach my $file_grp (@$file_grp_array) {
	327
	328	my $use = $file_grp->{'USE'};
	329
	330	my $file_entry = $file_grp->{'METS:file'};
	331
	332	my $file_array = undef;
	333
	334	if (ref $file_entry eq "HASH") {
	335	# upgrade single entry into array
	336	$file_array = [ $file_entry ];
	337	}
	338	else {
	339	$file_array = $file_entry;
	340	}
	341
	342	# print "**** num files = ", scalar(@$file_array), "\n";
	343
	344	foreach my $file_hash (@$file_array) {
	345	# push file_grp USE attribute down into each file entry (to make file easier later on)
	346	$file_hash->{'USE'} = $use;
	347
	348	my $file_id = $file_hash->{'ID'};
	349	$file_sec_ids->{$file_id} = $file_hash;
	350
	351	# print "file id = $file_id\n";
	352	}
	353
	354	}
	355
	356	# METS:mets->METS:structMap->{nested METS:div}+
	357
	358	my $struct_map_array = $json_data->{'METS:mets'}->{'METS:structMap'};
	359	my $toplevel_div = $struct_map_array->{'METS:div'};
	360
	361	my $pi_filename = $ofilename;
	362	$pi_filename =~ s/_structure\.json$/_item.xml/;
	363
	364	generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);
	365
	366
	367	## print "**** json_content = $json_content_utf8\n\n";
	368
	369	exit 0;
	370
	371	}
	372
	373	sub read_json_file
	374	{
	375	my ($filename) = @_;
	376
	377	print STDERR "+ Proccessing file: $filename\n";
	378
	379	my $json_file_content = "";
	380	open(JSON_FILE, "<$filename");
	381	binmode(JSON_FILE,":utf8");
	382
	383	my $line;
	384	while (defined ($line=<JSON_FILE>)) {
	385	$json_file_content .= $line;
	386	}
	387
	388	close(JSON_FILE);
	389
	390	my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);
	391	my $json_data = decode_json $json_file_content_utf8;
	392
	393	my $record_hash = $json_data->{'records'};
	394	my @record_keys = keys %$record_hash;
	395	my $primary_cat_key = shift @record_keys;
	396
	397	my $items_array = $json_data->{'items'};
	398	my $num_items = scalar(@$items_array);
	399
	400	my $num_pd = 0;
	401
	402	foreach my $item (@$items_array) {
	403
	404	my $htid = $item->{'htid'};
	405	my $rights_code = $item->{'rightsCode'};
	406
	407	# print "htid = $htid\n";
	408	# print "Rights code = $rights_code\n" if defined $rights_code;
	409
	410	if (defined($rights_code) && ($rights_code eq "pd")) {
	411	# in the public domain
	412	$num_pd++;
	413
	414	my $htid_safe = uri_escape($htid);
	415
	416	my $ofilename = $filename;
	417	$ofilename =~ s/\.json/_structure.json/;
	418
	419	download_ht_doc($primary_cat_key,$htid,$ofilename);
	420
	421	# bail out at first public domain version of document
	422	last;
	423	}
	424	}
	425
	426	# if ($num_pd==0) {
	427	# print "++ $num_items item(s)\n";
	428	# }
	429	# else {
	430	# print "++ $num_items item(s) of which $num_pd is/are in the public domain\n";
	431	# }
	432
	433	}
	434
	435
	436	sub process_dir
	437	{
	438	my ($full_dir) = @_;
	439
	440	# print "Processing directory: $full_dir\n";
	441
	442	if (opendir(DIN, $full_dir)) {
	443	my @dir_content = grep { $_ !~ m/^\./ } readdir(DIN);
	444	closedir DIN;
	445
	446	foreach my $df (@dir_content) {
	447	my $full_df = "$full_dir/$df";
	448	if (-d $full_df) {
	449	my $full_sub_dir = $full_df;
	450	process_dir($full_sub_dir);
	451	}
	452	else {
	453	# file
	454	my $full_file = $full_df;
	455	if ($full_file =~ m/\.json$/) {
	456	read_json_file($full_file);
	457	}
	458	}
	459	}
	460
	461	}
	462	else {
	463
	464	print STDERR "Error: Failed to open directory: $full_dir\n";
	465	print STDERR " $!\n";
	466	}
	467
	468	}
	469
	470
	471	sub main
	472	{
	473	my ($argv_ref) = @_;
	474
	475	my $toplevel_dir = shift @$argv_ref \|\| "output";
	476
	477
	478	$toplevel_dir =~ s/\/$//; # remove any trailing /
	479
	480	process_dir($toplevel_dir);
	481
	482	}
	483
# Run the script: main() receives the command-line arguments by reference
# and removes the first one from @ARGV.
main(\@ARGV);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: