source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26442

Last change on this file since 26442 was 26442, checked in by davidb, 11 years ago

Further tweaks based on test-runs

File size: 13.2 KB
Line 
1#!/usr/bin/perl -w
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use warnings;
7
8use Encode;
9use JSON;
10
11# use LWP;
12
13use OAuth::Lite::Consumer;
14use OAuth::Lite::AuthMethod;
15
16use URI::Escape;
17
# _data_api($mode, $htid, $opt_seq, $opt_params)
#
# Issue one OAuth-signed GET request against the HathiTrust Data API
# (http://babel.hathitrust.org/cgi/htd/<mode>/<htid>[/<seq>]).
#
#   $mode       - API resource name placed in the URL (this file uses
#                 "pageimage", "pageocr" and "structure")
#   $htid       - HathiTrust volume identifier, placed in the URL as given
#   $opt_seq    - optional page sequence number appended to the URL
#   $opt_params - optional hash ref of extra request parameters
#
# Returns the response object on success.  On failure a diagnostic report is
# written to STDERR and undef is returned.
#
# The access/secret key pair may be overridden through the environment
# variables HTC_ACCESS_KEY and HTC_SECRET_KEY; when they are not set the
# original built-in keys are used, so existing invocations keep working.
sub _data_api
{
    my ($mode, $htid, $opt_seq, $opt_params) = @_;

    my $access_key = '7e6ee38bae';
    my $secret_key = 'e0429c0394385486249b4a230702';

    if (defined $ENV{'HTC_ACCESS_KEY'} && length $ENV{'HTC_ACCESS_KEY'}) {
        $access_key = $ENV{'HTC_ACCESS_KEY'};
    }
    if (defined $ENV{'HTC_SECRET_KEY'} && length $ENV{'HTC_SECRET_KEY'}) {
        $secret_key = $ENV{'HTC_SECRET_KEY'};
    }

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";

    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request( 'method' => 'GET',
                                       'url'    => $request_url,
                                       'params' => $opt_params );

    if (!$response->is_success()) {
        # Reduce the (HTML) error page to its text: drop the tags, then
        # empty out the lines that are left holding only whitespace.
        my $text_only_content = $response->content();
        $text_only_content = "" unless defined $text_only_content;
        $text_only_content =~ s/<[^>]*>//g;
        $text_only_content =~ s/^\s*$//mg;

        # The whole report goes to STDERR so the separators stay next to
        # the messages they separate and STDOUT is left untouched.
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        print STDERR "------\n";
        print STDERR "**** Status: ", $response->status_line, "\n";
        print STDERR "------\n";
        print STDERR "**** Content: $text_only_content\n";
        print STDERR "------\n";

        $response = undef;
    }

    return $response;
}
55
56
# pageimage_data_api($htid, $seq_num, $ofilename)
#
# Download the page image for sequence number $seq_num of volume $htid and
# store it, in binary mode, as $ofilename.  Nothing is requested when
# $ofilename already exists as a plain file.
#
# A failed download is retried once after a 60 second pause; if the second
# attempt fails too the whole script is terminated with exit(-1).
# A failure to write the file is reported on STDERR; the partial file is
# removed so a later run does not mistake it for a completed download.
sub pageimage_data_api
{
    my ($htid, $seq_num, $ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    my $max_attempts = 2;
    my $response     = undef;

    foreach my $attempt (1 .. $max_attempts) {
        $response = _data_api("pageimage", $htid, $seq_num);
        last if (defined $response);

        print STDERR "Failed to download PageImage\n";

        if ($attempt >= $max_attempts) {
            print STDERR "Maximum number of attempts reached. Stopping.\n";
            exit -1;
        }

        print STDERR "Sleeping to 60 seconds\n";
        sleep(60);
        print STDERR "Retry attempt $attempt\n";
    }

    my $content = $response->content();

    # Three-argument open: the file name is never parsed for mode characters.
    my $img_fh;
    if (open($img_fh, '>', $ofilename)) {
        binmode($img_fh);

        my $wrote_ok = print {$img_fh} $content;
        my $close_ok = close($img_fh);   # buffered write errors surface here

        if (!($wrote_ok && $close_ok)) {
            print STDERR "Error: Failed to write PageImage data to $ofilename\n";
            print STDERR " $!\n";
            unlink($ofilename);
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR " $!\n";
    }

    return;
}
103
104
105
# pageocr_data_api($htid, $seq_num, $ofilename)
#
# Return the OCR text for sequence number $seq_num of volume $htid.
#
#   $ofilename - optional cache file.  When it names an existing plain file
#                the text is read from it and no request is made.  When it
#                is defined but missing, the downloaded text is also saved
#                there.  When it is undefined the text is only returned.
#
# A failed download is retried once after a 60 second pause; if the second
# attempt fails too the whole script is terminated with exit(-1).
#
# Returns the text, or undef when the cache file is empty or unreadable.
#
# NOTE(review): freshly downloaded text is returned and written exactly as
# the response delivers it, whereas cached text is read back through the
# :utf8 layer - confirm whether callers care about that difference.
sub pageocr_data_api
{
    my ($htid, $seq_num, $ofilename) = @_;

    my $content = undef;

    if ((defined $ofilename) && (-f $ofilename)) {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n $ofilename\n";

        my $cache_fh;
        if (open($cache_fh, '<', $ofilename)) {
            binmode($cache_fh, ":utf8");

            while (defined (my $line = <$cache_fh>)) {
                $content .= $line;
            }
            close($cache_fh);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR " $!\n";
        }

        return $content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    my $max_attempts = 2;
    my $response     = undef;

    foreach my $attempt (1 .. $max_attempts) {
        $response = _data_api("pageocr", $htid, $seq_num);
        last if (defined $response);

        print STDERR "Failed to download PageOCR\n";

        if ($attempt >= $max_attempts) {
            print STDERR "Maximum number of attempts reached. Stopping.\n";
            exit -1;
        }

        print STDERR "Sleeping to 60 seconds\n";
        sleep(60);
        print STDERR "Retry attempt $attempt\n";
    }

    $content = $response->content();

    # Only cache when the caller actually supplied a file name.
    if (defined $ofilename) {
        my $txt_fh;
        if (open($txt_fh, '>', $ofilename)) {
            my $wrote_ok = print {$txt_fh} $content;
            my $close_ok = close($txt_fh);   # buffered write errors surface here

            if (!($wrote_ok && $close_ok)) {
                print STDERR "Error: Failed to write PageOCR data to $ofilename\n";
                print STDERR " $!\n";
                # Do not leave a truncated file that would later be
                # treated as a valid cache entry.
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for binary output\n";
            print STDERR " $!\n";
        }
    }

    return $content;
}
173
# json_structure_data_api($htid, $ofilename)
#
# Return the decoded JSON structure record for volume $htid.
#
# When $ofilename does not yet exist the record is requested from the Data
# API ("structure" resource, alt=json) and saved to $ofilename; otherwise
# the record is read back from $ofilename.
#
# Returns the data structure produced by decode_json.
# Dies when the download fails or when the JSON text cannot be decoded.
sub json_structure_data_api
{
    my ($htid, $ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure", $htid, undef, {'alt' => "json"});

        # _data_api has already reported the details; fail with a clear
        # message instead of calling a method on undef.
        if (!defined $response) {
            die "Error: Failed to download METS structure record for $htid\n";
        }

        $json_content = $response->content();

        my $out_fh;
        if (open($out_fh, '>', $ofilename)) {
            binmode($out_fh, ":utf8");

            my $wrote_ok = print {$out_fh} $json_content;
            my $close_ok = close($out_fh);   # buffered write errors surface here

            if (!($wrote_ok && $close_ok)) {
                print STDERR "Error: Failed to write JSON structure to $ofilename\n";
                print STDERR " $!\n";
                # A truncated cache file would make every later run fail
                # to decode it, so remove it.
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";

        my $in_fh;
        if (open($in_fh, '<', $ofilename)) {
            binmode($in_fh, ":utf8");

            while (defined (my $line = <$in_fh>)) {
                $json_content .= $line;
            }
            close($in_fh);
        }
        else {
            print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
            print STDERR " $!\n";
        }
    }

    # decode_json expects UTF-8 encoded bytes.
    my $json_content_utf8 = Encode::encode("utf8", $json_content);
    my $json_data = decode_json($json_content_utf8);

    return $json_data;
}
225
226
# Example of the <PagedDocument> item-file format (the code below writes the <PageGroup> and <Page> elements)
228
229#<PagedDocument>
230# <Metadata name="Title">Matariki 1881</Metadata>
231# <Metadata name="Date">18810423</Metadata>
232# <Metadata name="Number">1</Metadata>
233# <PageGroup>
234# <Metadata name="Title">Supplementary Material</Metadata>
235# <Page txtfile="abstracts/23__1abstract.txt">
236# <Metadata name="Title">Abstract</Metadata>
237# </Page>
238# </PageGroup>
239# <PageGroup>
240# <Metadata name="Title">Newspaper pages</Metadata>
241# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
242# <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
243# <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
244# </PageGroup>
245#</PagedDocument>
246
# rec_paged_image_structure($this_div, $pagenum, $depth, $htid,
#                           $file_id_map, $resource_output_dir)
#
# Recursively walk one METS:div of the structure map and write the matching
# <PageGroup>/<Page> lines to the PIOUT filehandle, which the caller must
# have opened.
#
#   $this_div            - hash ref for the current METS:div
#   $pagenum             - value for the pagenum attribute (may be undef)
#   $depth               - nesting depth; one space of indent per level
#   $htid                - volume identifier passed on to the download subs
#   $file_id_map         - hash ref mapping file IDs to file entries that
#                          carry 'USE', 'SEQ' and 'METS:FLocat'
#   $resource_output_dir - directory the page files are downloaded into
#
# A div with child divs is wrapped in <PageGroup>.  A div with METS:fptr
# entries becomes one <Page/> line; its image and OCR files are fetched
# through pageimage_data_api() and pageocr_data_api().
sub rec_paged_image_structure
{
    my ($this_div, $pagenum, $depth, $htid, $file_id_map, $resource_output_dir) = @_;

    # File names in the item file are relative to the directory that holds
    # it, so only the last path component of the resource dir is used.
    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);
    if (!defined $local_output_dir) {
        # No '/' in the path: the directory name is already relative.
        $local_output_dir = $resource_output_dir;
    }

    my $fptr_entry = $this_div->{'METS:fptr'};
    my $div_entry  = $this_div->{'METS:div'};

    # Only want Greenstones <PageGroup> tag if not a METS leaf div
    my $has_child_divs = defined $div_entry;

    if ($has_child_divs) {
        print PIOUT " " x $depth, "<PageGroup>\n";
    }

    if (defined $fptr_entry) {
        # hit a leaf node

        # upgrade single entry to array
        my $fptr_array = (ref($fptr_entry) eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};
            my $file   = (defined $fileid) ? $file_id_map->{$fileid} : undef;

            if (!defined $file) {
                my $shown_id = (defined $fileid) ? $fileid : "(undefined)";
                print STDERR "Warning: No file entry found for file id $shown_id\n";
                next;
            }

            my $seq  = $file->{'SEQ'};
            my $href = $file->{'METS:FLocat'}->{'xlink:href'};

            if (!defined $href) {
                print STDERR "Warning: No href found for file id $fileid\n";
                next;
            }

            my $use = (defined $file->{'USE'}) ? $file->{'USE'} : "";

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                pageimage_data_api($htid, $seq, "$resource_output_dir/$href");
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                pageocr_data_api($htid, $seq, "$resource_output_dir/$href");
            }
        }

        # Generate line along the following lines
        # <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT " " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"$pagenum\" " if defined $pagenum;
        print PIOUT "imgfile=\"$imgfile\" " if defined $imgfile;
        print PIOUT "txtfile=\"$txtfile\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs
    if ($has_child_divs) {
        # upgrade single entry to array
        my $div_array = (ref($div_entry) eq "HASH") ? [ $div_entry ] : $div_entry;

        print STDERR "+ Processing ", scalar(@$div_array), " sections\n";

        foreach my $div_hash (@$div_array) {
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash, $child_pagenum, $depth+1, $htid,
                                      $file_id_map, $resource_output_dir);
        }

        print PIOUT " " x $depth, "</PageGroup>\n";
    }

    return;
}
343
# generate_paged_image_structure($toplevel_div, $htid, $file_id_map, $ofilename)
#
# Write the <PagedDocument> item file $ofilename for volume $htid and, as a
# side effect of walking the structure, download the page files into a
# directory named after $ofilename with its extension removed.
#
#   $toplevel_div - hash ref of the top-level METS:div
#   $file_id_map  - hash ref mapping file IDs to file entries
#
# Problems are reported on STDERR.  Nothing is written or downloaded when
# the resource directory cannot be determined or created.
sub generate_paged_image_structure
{
    my ($toplevel_div, $htid, $file_id_map, $ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);

    if (!defined $resource_output_dir) {
        print STDERR "Error: Cannot derive a resource directory from $ofilename\n";
        return;
    }

    if ((!-d $resource_output_dir) && (!mkdir($resource_output_dir))) {
        # Without the directory every page would be downloaded and then
        # fail to be saved, so stop here.
        print STDERR "Error: Failed to create directory $resource_output_dir\n";
        print STDERR " $!\n";
        return;
    }

    # PIOUT stays a package filehandle because rec_paged_image_structure()
    # prints to it by name.
    if (open(PIOUT, '>', $ofilename)) {
        binmode(PIOUT, ":utf8");

        print PIOUT "<PagedDocument>\n";

        rec_paged_image_structure($toplevel_div, 1, 1, $htid, $file_id_map, $resource_output_dir);

        print PIOUT "</PagedDocument>\n";

        if (!close(PIOUT)) {
            print STDERR "Error: Failed to finish writing $ofilename\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR " $!\n";
    }

    return;
}
376
377
# Number of documents handed to generate_paged_image_structure() so far.
my $pdCount = 0;

# download_ht_doc($cat_key, $htid, $ofilename)
#
# Fetch (or re-read) the structure record of volume $htid, index its file
# entries by ID, and generate the item file plus page downloads.
#
#   $cat_key   - catalog record key; accepted but not used here
#   $htid      - HathiTrust volume identifier
#   $ofilename - path of the "*_structure.json" cache file; the item file is
#                written next to it with "_structure.json" replaced by
#                "_item.xml"
sub download_ht_doc
{
    my ($cat_key, $htid, $ofilename) = @_;

    my $json_data = json_structure_data_api($htid, $ofilename);

    if (ref($json_data) ne "HASH") {
        print STDERR "Error: Structure record for $htid is not a JSON object\n";
        return;
    }

    # Map in the IDs from:
    # METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_entry = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # upgrade single entry into array (same convention as for METS:file)
    my $file_grp_array = (ref($file_grp_entry) eq "HASH") ? [ $file_grp_entry ] : $file_grp_entry;
    $file_grp_array = [] if (ref($file_grp_array) ne "ARRAY");

    foreach my $file_grp (@$file_grp_array) {

        my $use = $file_grp->{'USE'};

        my $file_entry = $file_grp->{'METS:file'};

        # upgrade single entry into array
        my $file_array = (ref($file_entry) eq "HASH") ? [ $file_entry ] : $file_entry;
        $file_array = [] if (ref($file_array) ne "ARRAY");

        foreach my $file_hash (@$file_array) {
            # push file_grp USE attribute down into each file entry (to make file easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            next unless defined $file_id;

            $file_sec_ids->{$file_id} = $file_hash;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = $json_data->{'METS:mets'}->{'METS:structMap'};

    # NOTE(review): if several structMaps are present only the first is
    # used - confirm that this is the wanted one.
    $struct_map = $struct_map->[0] if (ref($struct_map) eq "ARRAY");

    my $toplevel_div = (ref($struct_map) eq "HASH") ? $struct_map->{'METS:div'} : undef;

    if (!defined $toplevel_div) {
        print STDERR "Error: No top-level METS:div found in structure record for $htid\n";
        return;
    }

    my $pi_filename = $ofilename;
    $pi_filename =~ s/_structure\.json$/_item.xml/;

    generate_paged_image_structure($toplevel_div, $htid, $file_sec_ids, $pi_filename);

    $pdCount++;

    return;
}
445
# read_json_file($filename)
#
# Read one catalog JSON file, look through its 'items' for the first one
# whose rightsCode is "pd" (public domain) and hand that item to
# download_ht_doc().  The structure cache file name is derived from
# $filename by replacing the trailing ".json" with "_structure.json".
#
# Files that cannot be opened, or that carry no 'items' entry, are reported
# on STDERR and skipped.  Malformed JSON still makes decode_json die.
#
# Returns the number of documents handed to download_ht_doc() (0 or 1).
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    my $json_fh;
    if (!open($json_fh, '<', $filename)) {
        print STDERR "Error: Failed to open $filename for input\n";
        print STDERR " $!\n";
        return 0;
    }
    binmode($json_fh, ":utf8");

    my $json_file_content = "";
    while (defined (my $line = <$json_fh>)) {
        $json_file_content .= $line;
    }

    close($json_fh);

    # decode_json expects UTF-8 encoded bytes.
    my $json_file_content_utf8 = Encode::encode("utf8", $json_file_content);
    my $json_data = decode_json($json_file_content_utf8);

    my $items_entry = (ref($json_data) eq "HASH") ? $json_data->{'items'} : undef;

    if ((ref($items_entry) ne "HASH") && (ref($items_entry) ne "ARRAY")) {
        print STDERR "Skipping $filename: no 'items' entry found\n";
        return 0;
    }

    my $record_hash = $json_data->{'records'};
    my $primary_cat_key = undef;
    if (ref($record_hash) eq "HASH") {
        my @record_keys = keys %$record_hash;
        $primary_cat_key = shift @record_keys;
    }

    print STDERR "*** ref: ", ref($items_entry), "\n\n";

    # upgrade single entry to array
    my $items_array = (ref($items_entry) eq "HASH") ? [ $items_entry ] : $items_entry;

    my $num_pd = 0;

    foreach my $item (@$items_array) {

        my $htid        = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        if (defined($rights_code) && ($rights_code eq "pd")) {
            # in the public domain
            $num_pd++;

            # Only the file extension is rewritten; a ".json" elsewhere in
            # the path is left alone.
            my $ofilename = $filename;
            $ofilename =~ s/\.json$/_structure.json/;

            download_ht_doc($primary_cat_key, $htid, $ofilename);

            # bail out at first public domain version of document
            last;
        }
    }

    return $num_pd;
}
519
520
# process_dir($full_dir)
#
# Recursively walk $full_dir in sorted order, ignoring entries whose name
# starts with '.', and pass every "*.json" catalog file to read_json_file().
#
# Files ending in "_structure.json" are skipped: those are the structure
# cache files this script writes next to the catalog files, not catalog
# records.
#
# A directory that cannot be opened is reported on STDERR and skipped.
sub process_dir
{
    my ($full_dir) = @_;

    # Lexical handle: each level of the recursion gets its own.
    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR " $!\n";
        return;
    }

    my @dir_content = grep { $_ !~ m/^\./ } sort readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
            next;
        }

        # file
        next unless ($full_df =~ m/\.json$/);
        next if ($full_df =~ m/_structure\.json$/);

        read_json_file($full_df);
    }

    return;
}
554
555
# main($argv_ref)
#
# Entry point.  Takes the first element off the argument list (falling back
# to "output" when it is missing or false), strips one trailing '/', and
# processes that directory tree.
sub main
{
    my ($argv_ref) = @_;

    my $first_arg = shift(@$argv_ref);

    my $toplevel_dir;
    if ($first_arg) {
        $toplevel_dir = $first_arg;
    }
    else {
        $toplevel_dir = "output";
    }

    # remove any trailing /
    $toplevel_dir =~ s/\/$//;

    process_dir($toplevel_dir);
}

main(\@ARGV);
Note: See TracBrowser for help on using the repository browser.