Context Navigation

htc-get-pd-docs.pl@ 26442

Last change on this file since 26442 was 26442, checked in by davidb, 11 years ago
Further tweaks based on test-runs
File size: 13.2 KB

Line
1	#!/usr/bin/perl -w
2
3	use strict;
4	no strict 'refs'; # allow filehandles to be variables and viceversa
5
6	use warnings;
7
8	use Encode;
9	use JSON;
10
11	# use LWP;
12
13	use OAuth::Lite::Consumer;
14	use OAuth::Lite::AuthMethod;
15
16	use URI::Escape;
17
# Issue a signed GET request against the HathiTrust Data API.
#
# Parameters:
#   $mode       - API resource name placed in the URL path (callers in this
#                 file pass "pageimage", "pageocr" and "structure")
#   $htid       - volume identifier appended to the URL path
#   $opt_seq    - optional page sequence number; appended to the URL path
#                 only when defined
#   $opt_params - optional hash ref of extra query parameters, handed to
#                 OAuth::Lite::Consumer->request() as 'params'
#
# Returns the response object on success, or undef after logging the
# failure details to STDERR.
#
# NOTE(review): the API credentials are hard-coded below; consider loading
# them from configuration that is kept outside the repository.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    my $access_key = '7e6ee38bae';
    my $secret_key = 'e0429c0394385486249b4a230702';

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request(
        'method' => 'GET',
        'url'    => $request_url,
        'params' => $opt_params );

    return $response if $response->is_success();

    # Every line of the failure report, separators included, goes to STDERR
    # so the report stays together and STDOUT is not polluted.
    print STDERR "**** Failed to retrieval any content from URL:\n";
    print STDERR " ", $consumer->oauth_request->uri, "\n";
    print STDERR "------\n";
    print STDERR "**** Status: ", $response->status_line, "\n";
    print STDERR "------\n";

    # Strip markup and blank-only lines so the error body is readable.
    my $text_only_content = $response->content();
    $text_only_content =~ s/<[^>]*>//g;
    $text_only_content =~ s/^\s*$//mg;

    print STDERR "**** Content: $text_only_content\n";
    print STDERR "------\n";

    # Callers test the result with defined(), so failure is a plain undef.
    return undef;
}
55
56
# Download one page image through the Data API and store it in $ofilename.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - destination file; when it already exists the download is
#                skipped, so the file acts as a cache
#
# A failed request is retried once after a 60 second pause; a second
# failure terminates the program via exit.  No meaningful return value.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    my $max_attempts = 2;
    my $response     = undef;

    foreach my $attempt (1 .. $max_attempts) {
        $response = _data_api("pageimage",$htid, $seq_num );
        last if defined $response;

        print STDERR "Failed to download PageImage\n";

        if ($attempt >= $max_attempts) {
            print STDERR "Maximum number of attempts reached. Stopping.\n";
            exit -1;
        }

        print STDERR "Sleeping to 60 seconds\n";
        sleep(60);
        print STDERR "Retry attempt $attempt\n";
    }

    my $content = $response->content();

    my $img_out;
    if (!open($img_out, ">", $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR " $!\n";
        return;
    }

    binmode($img_out);
    my $written = print {$img_out} $content;
    my $closed  = close($img_out);

    if (!$written || !$closed) {
        print STDERR "Error: Failed to write $ofilename\n";
        print STDERR " $!\n";
        # A truncated file would be mistaken for a finished download on the
        # next run (see the -f test above), so do not leave it behind.
        unlink($ofilename);
    }

    return;
}
103
104
105
# Fetch the OCR text of one page, using $ofilename as an optional cache.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - optional cache file.  When it exists its content is
#                returned without contacting the API; when it is defined
#                but missing, the downloaded text is saved there; when it
#                is undefined the text is only returned.
#
# Returns the page text, or undef when the cached file is empty or cannot
# be read.  A failed request is retried once after a 60 second pause; a
# second failure terminates the program via exit.
#
# NOTE(review): the cached branch reads through a :utf8 layer while the
# download branch returns $response->content() unchanged, so the two
# branches may differ in character/byte semantics - confirm with callers.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    my $content = undef;

    if ((defined $ofilename) && (-f $ofilename)) {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n $ofilename\n";

        my $txt_in;
        if (open($txt_in, "<", $ofilename)) {
            binmode($txt_in,":utf8");

            while (defined (my $line = <$txt_in>)) {
                $content .= $line;
            }
            close($txt_in);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR " $!\n";
        }

        return $content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    my $max_attempts = 2;
    my $response     = undef;

    foreach my $attempt (1 .. $max_attempts) {
        $response = _data_api("pageocr",$htid, $seq_num );
        last if defined $response;

        print STDERR "Failed to download PageOCR\n";

        if ($attempt >= $max_attempts) {
            print STDERR "Maximum number of attempts reached. Stopping.\n";
            exit -1;
        }

        print STDERR "Sleeping to 60 seconds\n";
        sleep(60);
        print STDERR "Retry attempt $attempt\n";
    }

    $content = $response->content();

    # The cache file is optional: without a filename there is nothing to
    # write, and interpolating undef into open() would only raise errors.
    if (defined $ofilename) {
        my $txt_out;
        if (open($txt_out, ">", $ofilename)) {
            my $written = print {$txt_out} $content;
            my $closed  = close($txt_out);

            if (!$written || !$closed) {
                print STDERR "Error: Failed to write $ofilename\n";
                print STDERR " $!\n";
                # Do not leave a truncated file that a later run would
                # treat as a complete cached copy.
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for binary output\n";
            print STDERR " $!\n";
        }
    }

    return $content;
}
173
# Obtain the METS structure record of a volume as decoded JSON.
#
# Parameters:
#   $htid      - volume identifier
#   $ofilename - cache file; when it exists it is read instead of calling
#                the API, otherwise the downloaded JSON is saved there
#
# Returns the data structure produced by decode_json.
# Dies when the download fails, or (from decode_json) when the JSON text
# is empty or malformed.
#
# NOTE(review): the text is written through a :utf8 layer and then passed
# through Encode::encode before decoding.  If content() already returns
# UTF-8 bytes, non-ASCII characters may end up double-encoded - confirm
# against a record containing non-ASCII data.
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );

        # _data_api returns undef on failure; report that clearly instead
        # of failing on a method call against an undefined value.
        if (!defined $response) {
            die "Error: Failed to download METS structure record for $htid\n";
        }

        $json_content = $response->content();

        my $json_out;
        if (open($json_out, ">", $ofilename)) {
            binmode($json_out,":utf8");
            my $written = print {$json_out} $json_content;
            my $closed  = close($json_out);

            if (!$written || !$closed) {
                print STDERR "Error: Failed to write $ofilename\n";
                print STDERR " $!\n";
                # A truncated cache file would be reused by the next run.
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";

        my $json_in;
        if (open($json_in, "<", $ofilename)) {
            binmode($json_in,":utf8");

            while (defined (my $line = <$json_in>)) {
                $json_content .= $line;
            }
            close($json_in);
        }
        else {
            print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
            print STDERR " $!\n";
        }
    }

    my $json_content_utf8 = Encode::encode("utf8",$json_content);
    my $json_data = decode_json($json_content_utf8);

    return $json_data;
}
225
226
227	# Example file
228
229	#<PagedDocument>
230	# <Metadata name="Title">Matariki 1881</Metadata>
231	# <Metadata name="Date">18810423</Metadata>
232	# <Metadata name="Number">1</Metadata>
233	# <PageGroup>
234	# <Metadata name="Title">Supplementary Material</Metadata>
235	# <Page txtfile="abstracts/23__1abstract.txt">
236	# <Metadata name="Title">Abstract</Metadata>
237	# </Page>
238	# </PageGroup>
239	# <PageGroup>
240	# <Metadata name="Title">Newspaper pages</Metadata>
241	# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
242	# <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
243	# <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
244	# </PageGroup>
245	#</PagedDocument>
246
# Recursively walk a METS <div> tree, writing Greenstone paged-image XML to
# the PIOUT filehandle (opened by generate_paged_image_structure) and
# downloading the page resources referenced along the way.
#
# Parameters:
#   $this_div            - hash ref for the current METS:div
#   $pagenum             - page number written as the pagenum attribute
#                          (omitted when undefined)
#   $depth               - nesting depth, used for output indentation
#   $htid                - volume identifier passed to the download helpers
#   $file_id_map         - hash ref mapping FILEID => file entry
#   $resource_output_dir - directory the page files are downloaded into
#
# A div with child divs is wrapped in <PageGroup>; a div with METS:fptr
# entries produces one <Page/> element.
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    # Paths written into the XML use only the last component of the
    # resource directory.
    # NOTE(review): assumes the item file sits beside that directory and
    # that a relative path was the intent - confirm.
    my ($local_output_dir) = ($resource_output_dir =~ m{([^/]*)\z});

    my $div_entry    = $this_div->{'METS:div'};
    my $fptr_entry   = $this_div->{'METS:fptr'};
    my $has_children = defined $div_entry;

    if ($has_children) {
        # Only want Greenstone's <PageGroup> tag if not a METS leaf div
        print PIOUT " " x $depth, "<PageGroup>\n";
    }

    if (defined $fptr_entry) {
        # hit a leaf node

        # upgrade single entry to array
        my $fptr_array = (ref $fptr_entry eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};
            next unless defined $fileid;

            # A FILEID without an entry in the map has nothing to download.
            my $file = $file_id_map->{$fileid};
            next unless defined $file;

            my $seq  = $file->{'SEQ'};
            my $href = $file->{'METS:FLocat'}->{'xlink:href'};
            my $use  = defined $file->{'USE'} ? $file->{'USE'} : "";

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                pageimage_data_api($htid,$seq,"$resource_output_dir/$href");
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                pageocr_data_api($htid,$seq,"$resource_output_dir/$href");
            }
        }

        # Generate a line along the following lines:
        # <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT " " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"$pagenum\" " if defined $pagenum;
        print PIOUT "imgfile=\"$imgfile\" " if defined $imgfile;
        print PIOUT "txtfile=\"$txtfile\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs
    if ($has_children) {
        # upgrade single entry to array
        my $div_array = (ref $div_entry eq "HASH") ? [ $div_entry ] : $div_entry;

        print STDERR "+ Processing ", scalar(@$div_array), " sections\n";

        foreach my $div_hash (@$div_array) {
            rec_paged_image_structure($div_hash,$div_hash->{'ORDER'},$depth+1,
                                      $htid,$file_id_map,$resource_output_dir);
        }

        print PIOUT " " x $depth, "</PageGroup>\n";
    }

    return;
}
343
# Write the Greenstone paged-image item file for one volume and download
# the page resources it refers to.
#
# Parameters:
#   $toplevel_div - hash ref of the top-level METS:div
#   $htid         - volume identifier
#   $file_id_map  - hash ref mapping FILEID => file entry
#   $ofilename    - item file to write; the same name without its
#                   extension is used as the resource directory
#
# Errors are reported on STDERR; no meaningful return value.  The XML is
# written through the package filehandle PIOUT, which
# rec_paged_image_structure prints to.
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);

    if (!defined $resource_output_dir) {
        # Without an extension to strip there is no directory name to use.
        print STDERR "Error: Failed to derive resource directory from $ofilename\n";
        return;
    }

    if (!-d $resource_output_dir && !mkdir($resource_output_dir)) {
        # Every page download below would fail without the directory.
        print STDERR "Error: Failed to create directory $resource_output_dir\n";
        print STDERR " $!\n";
        return;
    }

    if (!open(PIOUT, ">", $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR " $!\n";
        return;
    }

    binmode(PIOUT,":utf8");

    print PIOUT "<PagedDocument>\n";
    rec_paged_image_structure($toplevel_div,1,1,$htid,$file_id_map,$resource_output_dir);
    print PIOUT "</PagedDocument>\n";

    if (!close(PIOUT)) {
        print STDERR "Error: Failed to write $ofilename\n";
        print STDERR " $!\n";
    }

    return;
}
376
377
# Number of documents handled by download_ht_doc so far.
my $pdCount = 0;

# Download one HathiTrust volume: fetch (or load from cache) its METS
# structure record, index the file entries by ID, and generate the
# Greenstone item file together with the page resources.
#
# Parameters:
#   $cat_key   - catalog record key (currently unused)
#   $htid      - volume identifier
#   $ofilename - cache file for the structure record; the item file name
#                is derived from it by replacing "_structure.json" with
#                "_item.xml"
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);

    # Map in the IDs from:
    #   METS:mets->METS:fileSec->METS:fileGrp
    my $file_sec_ids = {};

    my $file_grp_entry = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # A lone child arrives as a hash rather than a one-element array
    # elsewhere in this data, so normalise the file groups the same way.
    my $file_grp_array = (ref $file_grp_entry eq "HASH") ? [ $file_grp_entry ]
                       : (defined $file_grp_entry)       ? $file_grp_entry
                       :                                   [];

    foreach my $file_grp (@$file_grp_array) {
        my $use        = $file_grp->{'USE'};
        my $file_entry = $file_grp->{'METS:file'};

        # upgrade single entry into array
        my $file_array = (ref $file_entry eq "HASH") ? [ $file_entry ]
                       : (defined $file_entry)       ? $file_entry
                       :                               [];

        foreach my $file_hash (@$file_array) {
            # push the file_grp USE attribute down into each file entry
            # (to make lookups easier later on)
            $file_hash->{'USE'} = $use;

            $file_sec_ids->{ $file_hash->{'ID'} } = $file_hash;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+
    my $struct_map   = $json_data->{'METS:mets'}->{'METS:structMap'};
    my $toplevel_div = $struct_map->{'METS:div'};

    my $pi_filename = $ofilename;
    $pi_filename =~ s/_structure\.json$/_item.xml/;

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    $pdCount++;

    # if ($pdCount>5) {
    #     exit 0;
    # }
}
445
# Read one catalog JSON file and download the first item in it whose
# rightsCode is "pd" (public domain).
#
# Parameters:
#   $filename - path of the catalog JSON file; the structure cache file
#               name is derived from it by replacing the trailing ".json"
#               with "_structure.json"
#
# An unreadable file is reported on STDERR and skipped.  Malformed JSON
# still raises the exception thrown by decode_json.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    my $json_in;
    if (!open($json_in, "<", $filename)) {
        print STDERR "Error: Failed to open JSON file $filename for input\n";
        print STDERR " $!\n";
        return;
    }
    binmode($json_in,":utf8");

    my $json_file_content = "";
    while (defined (my $line = <$json_in>)) {
        $json_file_content .= $line;
    }
    close($json_in);

    my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);
    my $json_data = decode_json($json_file_content_utf8);

    my $record_hash = $json_data->{'records'};
    my @record_keys = (ref $record_hash eq "HASH") ? keys %$record_hash : ();
    my $primary_cat_key = shift @record_keys;

    my $items_entry = $json_data->{'items'};

    print STDERR "*** ref: ", ref $items_entry, "\n\n";

    # Normalise to an array ref: a lone item arrives as a hash, and a file
    # without an 'items' entry is treated as having none.
    my $items_array = (ref $items_entry eq "HASH")  ? [ $items_entry ]
                    : (ref $items_entry eq "ARRAY") ? $items_entry
                    :                                 [];

    foreach my $item (@$items_array) {
        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        next unless (defined($rights_code) && ($rights_code eq "pd"));

        # in the public domain
        my $ofilename = $filename;
        # Anchored so that only the file extension is rewritten, never a
        # ".json" that happens to occur earlier in the path.
        $ofilename =~ s/\.json$/_structure.json/;

        download_ht_doc($primary_cat_key,$htid,$ofilename);

        # bail out at first public domain version of document
        last;
    }

    return;
}
519
520
# Recursively walk a directory tree and hand every catalog JSON file to
# read_json_file.
#
# Parameters:
#   $full_dir - directory to scan; entries whose name starts with "." are
#               ignored and the rest are visited in sorted order
#
# Files ending in "_structure.json" are skipped: they are the structure
# caches this script writes next to each catalog file, not catalog records,
# so a re-run over the same tree must not feed them back in.
#
# A directory that cannot be opened is reported on STDERR.
sub process_dir
{
    my ($full_dir) = @_;

    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR " $!\n";
        return;
    }

    # Read everything up front so the handle is closed before recursing.
    my @dir_content = grep { $_ !~ m/^\./ } sort readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
            next;
        }

        # file
        next unless ($full_df =~ m/\.json$/);
        next if ($full_df =~ m/_structure\.json$/);

        read_json_file($full_df);
    }

    return;
}
554
555
# Script entry point: process the directory named by the first command-line
# argument, falling back to "output" when the argument is missing or false.
sub main
{
    my ($argv_ref) = @_;

    my $toplevel_dir = shift @$argv_ref;
    $toplevel_dir = "output" unless $toplevel_dir;

    # remove any trailing /
    $toplevel_dir =~ s{/$}{};

    process_dir($toplevel_dir);
}

main(\@ARGV);

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26442

Download in other formats: