Context Navigation

htc-get-pd-docs.pl@ 26442

Last change on this file since 26442 was 26442, checked in by davidb, 11 years ago
Further tweaks based on test-runs
File size: 13.2 KB

Rev	Line
[26436]	1	#!/usr/bin/perl -w
	2
	3	use strict;
	4	no strict 'refs'; # allow filehandles to be variables and viceversa
	5
	6	use warnings;
	7
	8	use Encode;
	9	use JSON;
	10
	11	# use LWP;
	12
	13	use OAuth::Lite::Consumer;
	14	use OAuth::Lite::AuthMethod;
	15
	16	use URI::Escape;
	17
	18	sub _data_api
	19	{
	20	my ($mode,$htid,$opt_seq,$opt_params) = @_;
	21
	22	my $access_key = '7e6ee38bae';
	23	my $secret_key = 'e0429c0394385486249b4a230702';
	24
	25	my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
	26
	27	$request_url .= "/$opt_seq" if (defined $opt_seq);
	28
	29	my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
	30	'consumer_secret' => $secret_key,
	31	'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );
	32
	33	my $response = $consumer->request( 'method' => 'GET',
	34	'url' => $request_url,
	35	'params' => $opt_params );
	36
	37	if (!$response->is_success()) {
	38	print STDERR "**** Failed to retrieval any content from URL:\n";
	39	print STDERR " ", $consumer->oauth_request->uri, "\n";
	40	print "------\n";
	41	print STDERR "**** Status: ", $response->status_line, "\n";
	42	print "------\n";
[26442]	43	my $text_only_content = $response->content();
	44	$text_only_content =~ s/<[^>]*>//g;
	45	$text_only_content =~ s/^\s*$//mg;
	46
	47	print STDERR "**** Content: $text_only_content\n";
[26436]	48	print "------\n";
	49
	50	$response = undef;
	51	}
	52
	53	return $response;
	54	}
	55
	56
	57	sub pageimage_data_api
	58	{
	59	my ($htid,$seq_num,$ofilename) = @_;
	60
	61	if (!-f $ofilename) {
	62	print STDERR "Downloading PageImage $htid/$seq_num\n";
	63
[26442]	64	my $retryCount = 0;
	65	PageImageRetry:
[26436]	66	my $response = _data_api("pageimage",$htid, $seq_num );
[26442]	67	if (defined $response) {
	68	$retryCount = 0; # reset it
	69	my $content = $response->content();
[26436]	70
[26442]	71	if (open(IMGOUT,">$ofilename")) {
	72	binmode(IMGOUT);
	73	print IMGOUT $content;
	74	close(IMGOUT);
	75	}
	76	else {
	77	print STDERR "Error: Failed to open $ofilename for binary output\n";
	78	print STDERR " $!\n";
	79	}
[26436]	80	}
	81	else {
[26442]	82	$retryCount++;
	83	print STDERR "Failed to download PageImage\n";
	84
	85	if ($retryCount<2) {
	86	print STDERR "Sleeping to 60 seconds\n";
	87	sleep(60);
	88	print STDERR "Retry attempt $retryCount\n";
	89	goto PageImageRetry;
	90	}
	91	else {
	92	print STDERR "Maximum number of attempts reached. Stopping.\n";
	93	exit -1;
	94	}
	95	}
	96
[26436]	97	}
	98	else {
	99	print STDERR "Skipping PageImage data API request\n";
	100	print STDERR "=> downloaded file $ofilename already exists\n";
	101	}
	102	}
	103
	104
	105
	106	sub pageocr_data_api
	107	{
	108	my ($htid,$seq_num,$ofilename) = @_;
	109
	110	my $content = undef;
	111
	112	if (((defined $ofilename) && (!-f $ofilename))
	113	\|\| (!defined $ofilename)) {
	114	print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";
[26442]	115
	116	my $retryCount = 0;
	117	PageOcrRetry:
	118
	119	my $response = _data_api("pageocr",$htid, $seq_num );
	120
	121	if (defined $response) {
	122	$retryCount = 0; # reset it
	123
	124	$content = $response->content();
[26436]	125
[26442]	126	if (open(TXTOUT,">$ofilename")) {
	127	print TXTOUT $content;
	128	close(TXTOUT);
	129	}
	130	else {
	131	print STDERR "Error: Failed to open $ofilename for binary output\n";
	132	print STDERR " $!\n";
	133	}
[26436]	134	}
	135	else {
[26442]	136	$retryCount++;
	137	print STDERR "Failed to download PageOCR\n";
	138
	139	if ($retryCount<2) {
	140	print STDERR "Sleeping to 60 seconds\n";
	141	sleep(60);
	142	print STDERR "Retry attempt $retryCount\n";
	143	goto PageOcrRetry;
	144	}
	145	else {
	146	print STDERR "Maximum number of attempts reached. Stopping.\n";
	147	exit -1;
	148	}
	149	}
	150
[26436]	151	}
	152	else {
	153	print STDERR "Skipping PageOCR Data API request\n";
	154	print STDERR "=> Using cached version of file:\n $ofilename\n";
	155
	156	if (open(JSIN,"<$ofilename")) {
	157	binmode(JSIN,":utf8");
	158
	159	my $line;
	160	while (defined ($line=<JSIN>)) {
	161	$content .= $line;
	162	}
	163	close(JSIN);
	164	}
	165	else {
	166	print STDERR "Error: Failed to open cached file $ofilename for input\n";
	167	print STDERR " $!\n";
	168	}
	169	}
	170
	171	return $content;
	172	}
	173
	174	sub json_structure_data_api
	175	{
	176	my ($htid,$ofilename) = @_;
	177
	178	my $json_content = "";
	179
	180	if (!-f $ofilename) {
	181	print STDERR "Downloading METS structure record for $htid\n";
	182
	183	my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
	184	$json_content = $response->content();
	185
	186	if (open(JSOUT,">$ofilename")) {
	187	binmode(JSOUT,":utf8");
	188	print JSOUT $json_content;
	189	close(JSOUT);
	190	}
	191	else {
	192	print STDERR "Error: Failed to open $ofilename for output\n";
	193	print STDERR " $!\n";
	194	}
	195
	196	}
	197	else {
	198	print STDERR "Skipping Structure Data API request\n";
	199	print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";
	200
	201	if (open(JSIN,"<$ofilename")) {
	202	binmode(JSIN,":utf8");
	203
	204	my $line;
	205	while (defined ($line=<JSIN>)) {
	206	$json_content .= $line;
	207	}
	208	close(JSIN);
	209	}
	210	else {
	211	print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
	212	print STDERR " $!\n";
	213	}
	214	}
	215
	216	## print "**** $json_content\n";
	217
	218	my $json_content_utf8 = Encode::encode("utf8",$json_content);
	219	my $json_data = decode_json $json_content_utf8;
	220
	221	return $json_data;
	222
	223
	224	}
	225
	226
	227	# Example file
	228
	229	#<PagedDocument>
	230	# <Metadata name="Title">Matariki 1881</Metadata>
	231	# <Metadata name="Date">18810423</Metadata>
	232	# <Metadata name="Number">1</Metadata>
	233	# <PageGroup>
	234	# <Metadata name="Title">Supplementary Material</Metadata>
	235	# <Page txtfile="abstracts/23__1abstract.txt">
	236	# <Metadata name="Title">Abstract</Metadata>
	237	# </Page>
	238	# </PageGroup>
	239	# <PageGroup>
	240	# <Metadata name="Title">Newspaper pages</Metadata>
	241	# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
	242	# <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
	243	# <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
	244	# </PageGroup>
	245	#</PagedDocument>
	246
	247	sub rec_paged_image_structure
	248	{
[26442]	249	my ($this_div,$pagenum,$depth,$htid,$file_id_map,$resource_output_dir) = @_;
[26436]	250
	251	my ($local_output_dir) = ($resource_output_dir =~ m/^.\/(.?)$/);
	252
	253
	254	my $fptr_entry = $this_div->{'METS:fptr'};
[26442]	255
	256	if (defined $this_div->{'METS:div'}) {
	257	# Only want Greenstones <PageGroup> tag if not a METS leaf div
	258	print PIOUT " " x $depth, "<PageGroup>\n";
	259	}
[26436]	260
	261	if (defined $fptr_entry) {
	262	# hit a leaf node
	263
	264	my $fptr_array = undef;
	265
	266	if (ref $fptr_entry eq "HASH") {
	267	$fptr_array = [ $fptr_entry ];
	268	}
	269	else {
	270	$fptr_array = $fptr_entry;
	271	}
	272
	273	my $imgfile = undef;
	274	my $txtfile = undef;
	275
[26442]	276
[26436]	277	foreach my $fptr_hash (@$fptr_array) {
	278	my $fileid = $fptr_hash->{'FILEID'};
	279
	280	## print STDERR "Looking up fileid = $fileid\n";
	281
	282	my $file = $file_id_map->{$fileid};
	283	my $seq = $file->{'SEQ'};
	284	my $href = $file->{'METS:FLocat'}->{'xlink:href'};
	285
[26442]	286
[26436]	287	if ($file->{'USE'} =~ m/\bimage\b/i) {
	288	$imgfile = "$local_output_dir/$href";
	289	my $full_imgfile = "$resource_output_dir/$href";
	290	pageimage_data_api($htid,$seq,$full_imgfile);
	291	}
	292	elsif ($file->{'USE'} =~ m/\bocr\b/i) {
	293	$txtfile = "$local_output_dir/$href";
	294	my $full_txtfile = "$resource_output_dir/$href";
	295	pageocr_data_api($htid,$seq,$full_txtfile);
	296	}
[26442]	297
[26436]	298	}
	299	# Generate line along the following lines
	300
	301	# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
	302	print PIOUT " " x ($depth+1), "<Page ";
	303	print PIOUT "pagenum=\"$pagenum\" " if defined $pagenum;
	304	print PIOUT "imgfile=\"$imgfile\" " if defined $imgfile;
	305	print PIOUT "txtfile=\"$txtfile\" " if defined $txtfile;
	306	print PIOUT "/>\n";
	307
	308	}
	309
	310	# Now process any child divs
	311
	312	my $div_entry = $this_div->{'METS:div'};
	313
	314	if (defined $div_entry) {
	315
	316	my $div_array = undef;
	317
	318	if (ref $div_entry eq "HASH") {
	319	# upgrade single entry to array
	320	$div_array = [ $div_entry ];
	321	}
	322	else {
	323	$div_array = $div_entry;
	324	}
	325
[26442]	326	print STDERR "+ Processing ", scalar(@$div_array), " sections\n";
	327
[26436]	328	foreach my $div_hash (@$div_array) {
	329
	330	my $pagenum = $div_hash->{'ORDER'};
	331
[26442]	332	rec_paged_image_structure($div_hash,$pagenum,$depth+1,$htid,$file_id_map,$resource_output_dir);
[26436]	333	}
	334	}
	335
[26442]	336	if (defined $this_div->{'METS:div'}) {
	337	# Only want Greenstones <PageGroup> tag if not a METS leaf div
	338	print PIOUT " " x $depth, "</PageGroup>\n";
	339	}
[26436]	340
	341
	342	}
	343
	344	sub generate_paged_image_structure
	345	{
	346	my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;
	347
	348	print STDERR "Generating PageImage file: $ofilename\n";
	349
	350	my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);
	351	if (!-d $resource_output_dir) {
	352	mkdir $resource_output_dir;
	353	}
	354
	355	if (open(PIOUT,">$ofilename")) {
	356	binmode(PIOUT,":utf8");
[26442]	357
	358	print PIOUT "<PagedDocument>\n";
	359	# print PIOUT " <PageGroup>\n";
[26436]	360
[26442]	361	rec_paged_image_structure($toplevel_div,1,1,$htid,$file_id_map,$resource_output_dir);
[26436]	362
[26442]	363	# print PIOUT " </PageGroup>\n";
	364	print PIOUT "</PagedDocument>\n";
	365
[26436]	366	close(PIOUT);
	367	}
	368	else {
	369	print STDERR "Error: Failed to open $ofilename for output\n";
	370	print STDERR " $!\n";
	371	}
	372
	373
	374
	375	}
	376
	377
# Number of documents processed so far by download_ht_doc.
my $pdCount = 0;

# Download everything needed for one HathiTrust volume: the METS structure
# record (cached in $ofilename) and, through generate_paged_image_structure,
# the page images, the OCR text and the item XML file.
#
#   $cat_key   - catalogue record key of the volume (currently unused)
#   $htid      - HathiTrust volume id
#   $ofilename - cache file for the JSON structure record; expected to end in
#                "_structure.json", from which the "_item.xml" name is derived
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    # Derive the item file name first so that nothing is downloaded when it
    # would clash with the structure cache file.
    my $pi_filename = $ofilename;
    $pi_filename =~ s/_structure\.json$/_item.xml/;

    if ($pi_filename eq $ofilename) {
        print STDERR "Error: $ofilename does not end in _structure.json\n";
        print STDERR " Refusing to overwrite it with the item file\n";
        return;
    }

    my $json_data = json_structure_data_api($htid,$ofilename);

    # Map in the IDs from:
    # METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_entry = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # A lone fileGrp may be a hash rather than an array, just like the other
    # METS entries handled in this file; upgrade it.
    my $file_grp_array = [];
    if (ref($file_grp_entry) eq "HASH") {
        $file_grp_array = [ $file_grp_entry ];
    }
    elsif (ref($file_grp_entry) eq "ARRAY") {
        $file_grp_array = $file_grp_entry;
    }

    # print "**** num file grps = ", scalar(@$file_grp_array), "\n";

    foreach my $file_grp (@$file_grp_array) {

        my $use = $file_grp->{'USE'};

        my $file_entry = $file_grp->{'METS:file'};
        next if (!defined $file_entry);

        # upgrade single entry into array
        my $file_array = (ref($file_entry) eq "HASH") ? [ $file_entry ] : $file_entry;

        # print "**** num files = ", scalar(@$file_array), "\n";

        foreach my $file_hash (@$file_array) {
            # push file_grp USE attribute down into each file entry (to make life easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            $file_sec_ids->{$file_id} = $file_hash;

            # print "file id = $file_id\n";
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = $json_data->{'METS:mets'}->{'METS:structMap'};
    if (ref($struct_map) eq "ARRAY") {
        # several structMaps: fall back to the first one
        $struct_map = $struct_map->[0];
    }
    my $toplevel_div = $struct_map->{'METS:div'};

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    ## print "**** json_content = $json_content_utf8\n\n";

    $pdCount++;

    # if ($pdCount>5) {
    #   exit 0;
    # }

    return;
}
	445
	446	sub read_json_file
	447	{
	448	my ($filename) = @_;
	449
	450	print STDERR "+ Proccessing file: $filename\n";
	451
	452	my $json_file_content = "";
	453	open(JSON_FILE, "<$filename");
	454	binmode(JSON_FILE,":utf8");
	455
	456	my $line;
	457	while (defined ($line=<JSON_FILE>)) {
	458	$json_file_content .= $line;
	459	}
	460
	461	close(JSON_FILE);
	462
	463	my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);
	464	my $json_data = decode_json $json_file_content_utf8;
	465
	466	my $record_hash = $json_data->{'records'};
	467	my @record_keys = keys %$record_hash;
	468	my $primary_cat_key = shift @record_keys;
	469
[26442]	470	my $items_entry = $json_data->{'items'};
	471	my $items_array;
	472
	473	print STDERR "*** ref: ", ref $items_entry, "\n\n";
	474
	475
	476	if (ref $items_entry eq "HASH") {
	477	$items_array = [ $items_entry ];
	478	}
	479	else {
	480	$items_array = $items_entry;
	481	}
	482
[26436]	483	my $num_items = scalar(@$items_array);
	484
	485	my $num_pd = 0;
	486
	487	foreach my $item (@$items_array) {
	488
	489	my $htid = $item->{'htid'};
	490	my $rights_code = $item->{'rightsCode'};
	491
	492	# print "htid = $htid\n";
	493	# print "Rights code = $rights_code\n" if defined $rights_code;
	494
	495	if (defined($rights_code) && ($rights_code eq "pd")) {
	496	# in the public domain
	497	$num_pd++;
	498
	499	my $htid_safe = uri_escape($htid);
	500
	501	my $ofilename = $filename;
	502	$ofilename =~ s/\.json/_structure.json/;
	503
	504	download_ht_doc($primary_cat_key,$htid,$ofilename);
	505
	506	# bail out at first public domain version of document
	507	last;
	508	}
	509	}
	510
	511	# if ($num_pd==0) {
	512	# print "++ $num_items item(s)\n";
	513	# }
	514	# else {
	515	# print "++ $num_items item(s) of which $num_pd is/are in the public domain\n";
	516	# }
	517
	518	}
	519
	520
	521	sub process_dir
	522	{
	523	my ($full_dir) = @_;
	524
	525	# print "Processing directory: $full_dir\n";
	526
	527	if (opendir(DIN, $full_dir)) {
[26442]	528	my @dir_content = grep { $_ !~ m/^\./ } sort readdir(DIN);
[26436]	529	closedir DIN;
	530
	531	foreach my $df (@dir_content) {
	532	my $full_df = "$full_dir/$df";
	533	if (-d $full_df) {
	534	my $full_sub_dir = $full_df;
	535	process_dir($full_sub_dir);
	536	}
	537	else {
	538	# file
	539	my $full_file = $full_df;
	540	if ($full_file =~ m/\.json$/) {
	541	read_json_file($full_file);
	542	}
	543	}
	544	}
	545
	546	}
	547	else {
	548
	549	print STDERR "Error: Failed to open directory: $full_dir\n";
	550	print STDERR " $!\n";
	551	}
	552
	553	}
	554
	555
	556	sub main
	557	{
	558	my ($argv_ref) = @_;
	559
	560	my $toplevel_dir = shift @$argv_ref \|\| "output";
	561
	562
	563	$toplevel_dir =~ s/\/$//; # remove any trailing /
	564
	565	process_dir($toplevel_dir);
	566
	567	}
	568
	569	main(\@ARGV);

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26442

Download in other formats: