root/gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl @ 26442

Revision 26442, 13.2 KB (checked in by davidb, 7 years ago)

Further tweaks based on test-runs

Line 
1#!/usr/bin/perl -w
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use warnings;
7
8use Encode;
9use JSON;
10
11# use LWP;
12
13use OAuth::Lite::Consumer;
14use OAuth::Lite::AuthMethod;
15
16use URI::Escape;
17
# Send a signed GET request to the HathiTrust Data API and return the response.
#
# Parameters:
#   $mode       - resource segment of the request URL (callers in this file
#                 pass "pageimage", "pageocr" and "structure")
#   $htid       - volume identifier appended to the request URL
#   $opt_seq    - optional sequence number; appended as a further path
#                 segment only when defined
#   $opt_params - optional hash ref handed to the consumer as 'params'
#
# Returns the response object when is_success() is true.  Otherwise the
# request URI, the status line and the tag-stripped response body are logged
# to STDERR and undef is returned.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    # SECURITY NOTE(review): credentials are embedded in the source.  They are
    # kept as fallbacks so existing invocations keep working, but can now be
    # overridden through the environment (HT_ACCESS_KEY / HT_SECRET_KEY)
    # without editing the script.
    my $access_key = '7e6ee38bae';
    my $secret_key = 'e0429c0394385486249b4a230702';

    if (defined $ENV{'HT_ACCESS_KEY'} && $ENV{'HT_ACCESS_KEY'} ne '') {
        $access_key = $ENV{'HT_ACCESS_KEY'};
    }
    if (defined $ENV{'HT_SECRET_KEY'} && $ENV{'HT_SECRET_KEY'} ne '') {
        $secret_key = $ENV{'HT_SECRET_KEY'};
    }

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new(
        'consumer_key'    => $access_key,
        'consumer_secret' => $secret_key,
        'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request(
        'method' => 'GET',
        'url'    => $request_url,
        'params' => $opt_params );

    if ($response->is_success()) {
        return $response;
    }

    # Everything about the failure goes to STDERR, separators included, so
    # the report stays in one piece when the two streams are redirected
    # separately.
    print STDERR "**** Failed to retrieval any content from URL:\n";
    print STDERR "         ", $consumer->oauth_request->uri, "\n";
    print STDERR "------\n";
    print STDERR "**** Status:  ", $response->status_line, "\n";
    print STDERR "------\n";

    # Strip markup and blank-only lines so an HTML error page is readable
    # in the log.
    my $text_only_content = $response->content();
    $text_only_content =~ s/<[^>]*>//g;
    $text_only_content =~ s/^\s*$//mg;

    print STDERR "**** Content: $text_only_content\n";
    print STDERR "------\n";

    return undef;
}
55
56
# Download one page image through _data_api("pageimage", ...) and store it
# in $ofilename, unless that file already exists.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - path of the image file to create
#
# A failed download is retried once after a 60 second pause; when the second
# attempt also fails the whole script exits with status -1.  Problems writing
# the file are reported on STDERR.  No meaningful value is returned.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    my $max_attempts = 2;
    my $failed_attempts = 0;

    my $response = _data_api("pageimage",$htid, $seq_num );

    while (!defined $response) {
        $failed_attempts++;
        print STDERR "Failed to download PageImage\n";

        if ($failed_attempts >= $max_attempts) {
            print STDERR "Maximum number of attempts reached.  Stopping.\n";
            exit -1;
        }

        print STDERR "Sleeping to 60 seconds\n";
        sleep(60);
        print STDERR "Retry attempt $failed_attempts\n";

        $response = _data_api("pageimage",$htid, $seq_num );
    }

    my $content = $response->content();

    # Three-argument open: the path is derived from remote METS data, so it
    # must never be interpreted as part of the open mode.
    my $img_fh;
    if (open($img_fh, '>', $ofilename)) {
        binmode($img_fh);

        my $write_ok = print {$img_fh} $content;
        my $close_ok = close($img_fh);

        if (!$write_ok || !$close_ok) {
            my $write_error = $!;
            print STDERR "Error: Failed to write image data to $ofilename\n";
            print STDERR "       $write_error\n";
            # Remove the partial file; otherwise the next run would treat it
            # as an already-downloaded image and skip it.
            unlink($ofilename);
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR "       $!\n";
    }

    return;
}
103   
104
105
# Return the OCR text of one page, downloading it through
# _data_api("pageocr", ...) unless a cached copy exists in $ofilename.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - optional cache file; when undefined the text is always
#                downloaded and nothing is written to disk
#
# Returns the page text, or undef when a cached file exists but cannot be
# opened (or is empty).  A failed download is retried once after 60 seconds;
# a second failure terminates the script with exit status -1.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    my $content = undef;

    if (defined $ofilename && -f $ofilename) {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n    $ofilename\n";

        my $cache_fh;
        if (open($cache_fh, '<', $ofilename)) {
            binmode($cache_fh,":utf8");

            # Appending line by line keeps $content undef for an empty file.
            while (defined (my $line = <$cache_fh>)) {
                $content .= $line;
            }
            close($cache_fh);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR "       $!\n";
        }

        return $content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    my $max_attempts = 2;
    my $failed_attempts = 0;

    my $response = _data_api("pageocr",$htid, $seq_num );

    while (!defined $response) {
        $failed_attempts++;
        print STDERR "Failed to download PageOCR\n";

        if ($failed_attempts >= $max_attempts) {
            print STDERR "Maximum number of attempts reached.  Stopping.\n";
            exit -1;
        }

        print STDERR "Sleeping to 60 seconds\n";
        sleep(60);
        print STDERR "Retry attempt $failed_attempts\n";

        $response = _data_api("pageocr",$htid, $seq_num );
    }

    $content = $response->content();

    # Only cache when the caller supplied a file name; an undefined name is
    # an accepted way of asking for the text alone.
    if (defined $ofilename) {
        my $text_fh;
        if (open($text_fh, '>', $ofilename)) {
            print {$text_fh} $content;
            close($text_fh);
        }
        else {
            print STDERR "Error: Failed to open $ofilename for binary output\n";
            print STDERR "       $!\n";
        }
    }

    return $content;
}
173   
# Fetch the structure record of a volume as decoded JSON, using $ofilename
# as an on-disk cache.
#
# Parameters:
#   $htid      - volume identifier
#   $ofilename - cache file; read when it exists, otherwise the record is
#                requested via _data_api("structure", ..., {alt => "json"})
#                and saved there
#
# Returns the data structure produced by decode_json.
# Dies when the download fails, when no content could be obtained, or when
# the content is not valid JSON (the latter raised by decode_json itself).
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );

        # _data_api returns undef on failure; report that plainly instead of
        # failing on a method call against undef.
        if (!defined $response) {
            die "Error: Failed to download METS structure record for $htid\n";
        }

        $json_content = $response->content();

        my $out_fh;
        if (open($out_fh, '>', $ofilename)) {
            # NOTE(review): the :utf8 layers on write and read, plus the
            # encode step below, are kept exactly as before so cache files
            # written by earlier runs are read back the same way -- confirm
            # before changing any of them.
            binmode($out_fh,":utf8");
            print {$out_fh} $json_content;
            close($out_fh);
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR "       $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n    $ofilename\n";

        my $in_fh;
        if (open($in_fh, '<', $ofilename)) {
            binmode($in_fh,":utf8");

            while (defined (my $line = <$in_fh>)) {
                $json_content .= $line;
            }
            close($in_fh);
        }
        else {
            print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
            print STDERR "       $!\n";
        }
    }

    if (!defined $json_content || $json_content eq "") {
        die "Error: No JSON structure content available for $htid ($ofilename)\n";
    }

    my $json_content_utf8 = Encode::encode("utf8",$json_content);
    my $json_data = decode_json $json_content_utf8;

    return $json_data;
}
225
226
227# Example file
228
229#<PagedDocument>
230#  <Metadata name="Title">Matariki 1881</Metadata>
231#  <Metadata name="Date">18810423</Metadata>
232#  <Metadata name="Number">1</Metadata>
233#  <PageGroup>
234#    <Metadata name="Title">Supplementary Material</Metadata>
235#    <Page txtfile="abstracts/23__1abstract.txt">
236#      <Metadata name="Title">Abstract</Metadata>
237#    </Page>
238#  </PageGroup>
239#  <PageGroup>
240#    <Metadata name="Title">Newspaper pages</Metadata>
241#    <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
242#    <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
243#    <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
244#  </PageGroup>
245#</PagedDocument>
246
# Recursively walk a METS div and write the matching <PageGroup>/<Page>
# elements to the already-open PIOUT filehandle, downloading the page image
# and OCR text of every leaf on the way.
#
# Parameters:
#   $this_div            - hash ref for the current METS:div
#   $pagenum             - value for the pagenum attribute of this div's
#                          <Page> (omitted from the output when undefined)
#   $depth               - nesting depth, used for two-space indentation
#   $htid                - volume identifier handed to the download helpers
#   $file_id_map         - hash ref mapping FILEID to a file entry carrying
#                          'USE', 'SEQ' and 'METS:FLocat'
#   $resource_output_dir - directory that receives the downloaded files
#
# Nothing meaningful is returned; all output goes to PIOUT and STDERR.
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    # Attribute values come from remote data; escape them so the generated
    # file stays well-formed XML.
    my $escape_attr = sub {
        my ($value) = @_;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        return $value;
    };

    # File references in the XML are relative to the last path component of
    # the resource directory; a path without any '/' is used as it stands.
    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);
    $local_output_dir = $resource_output_dir if (!defined $local_output_dir);

    my $has_child_divs = defined $this_div->{'METS:div'};

    if ($has_child_divs) {
        # Only want Greenstones <PageGroup> tag if not a METS leaf div
        print PIOUT "  " x $depth, "<PageGroup>\n";
    }

    my $fptr_entry = $this_div->{'METS:fptr'};

    if (defined $fptr_entry) {
        # hit a leaf node

        # upgrade single entry to array
        my $fptr_array = (ref $fptr_entry eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};
            my $file = (defined $fileid) ? $file_id_map->{$fileid} : undef;

            if (!defined $file) {
                my $shown_id = (defined $fileid) ? $fileid : "(undefined)";
                print STDERR "Warning: No file entry found for FILEID $shown_id; skipping\n";
                next;
            }

            my $seq  = $file->{'SEQ'};
            my $href = $file->{'METS:FLocat'}->{'xlink:href'};
            my $use  = (defined $file->{'USE'}) ? $file->{'USE'} : "";

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                pageimage_data_api($htid,$seq,"$resource_output_dir/$href");
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                pageocr_data_api($htid,$seq,"$resource_output_dir/$href");
            }
        }

        # Generate a line along the following lines:
        #  <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT "  " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"", $escape_attr->($pagenum), "\" " if defined $pagenum;
        print PIOUT "imgfile=\"", $escape_attr->($imgfile), "\" " if defined $imgfile;
        print PIOUT "txtfile=\"", $escape_attr->($txtfile), "\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs

    my $div_entry = $this_div->{'METS:div'};

    if (defined $div_entry) {
        # upgrade single entry to array
        my $div_array = (ref $div_entry eq "HASH") ? [ $div_entry ] : $div_entry;

        print STDERR "+ Processing ", scalar(@$div_array), " sections\n";

        foreach my $div_hash (@$div_array) {
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash,$child_pagenum,$depth+1,$htid,$file_id_map,$resource_output_dir);
        }
    }

    if ($has_child_divs) {
        # Only want Greenstones <PageGroup> tag if not a METS leaf div
        print PIOUT "  " x $depth, "</PageGroup>\n";
    }

    return;
}
343
# Write the paged-image XML file for one volume and download its page
# resources into a sibling directory.
#
# Parameters:
#   $toplevel_div - hash ref of the top-level METS:div to walk
#   $htid         - volume identifier
#   $file_id_map  - hash ref mapping FILEID to file entry
#   $ofilename    - XML file to create; the resource directory is this path
#                   with its file extension removed
#
# The file is written through the global PIOUT handle because
# rec_paged_image_structure() prints to that handle.  Errors are reported on
# STDERR; nothing meaningful is returned.
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    # The extension must belong to the final path component, so a dot in a
    # directory name is never mistaken for one.
    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\.[^.\/]+$/);

    if (!defined $resource_output_dir || $resource_output_dir eq "") {
        print STDERR "Error: Cannot derive a resource directory from $ofilename\n";
        print STDERR "       (file name has no extension)\n";
        return;
    }

    if (!-d $resource_output_dir) {
        if (!mkdir($resource_output_dir)) {
            print STDERR "Error: Failed to create directory $resource_output_dir\n";
            print STDERR "       $!\n";
            return;
        }
    }

    if (open(PIOUT, '>', $ofilename)) {
        binmode(PIOUT,":utf8");

        print PIOUT "<PagedDocument>\n";

        rec_paged_image_structure($toplevel_div,1,1,$htid,$file_id_map,$resource_output_dir);

        print PIOUT "</PagedDocument>\n";

        if (!close(PIOUT)) {
            print STDERR "Error: Failed to finish writing $ofilename\n";
            print STDERR "       $!\n";
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR "       $!\n";
    }

    return;
}
376
377
# Number of documents handled by download_ht_doc() so far.
my $pdCount = 0;

# Download everything belonging to one volume: the JSON structure record,
# then (through generate_paged_image_structure) the paged-image XML file and
# the page resources it references.
#
# Parameters:
#   $cat_key   - catalogue record key of the volume (currently unused)
#   $htid      - volume identifier
#   $ofilename - path of the cached structure JSON file; the XML file name is
#                derived from it by replacing "_structure.json" with
#                "_item.xml"
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);

    # Map in the IDs from:
    # METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_entry = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # A lone fileGrp may be decoded as a hash rather than a one-element
    # array, just like the METS:file entries below; accept both shapes.
    my $file_grp_array;
    if (ref $file_grp_entry eq "HASH") {
        $file_grp_array = [ $file_grp_entry ];
    }
    elsif (defined $file_grp_entry) {
        $file_grp_array = $file_grp_entry;
    }
    else {
        $file_grp_array = [];
    }

    foreach my $file_grp (@$file_grp_array) {
        my $use = $file_grp->{'USE'};

        my $file_entry = $file_grp->{'METS:file'};
        next if (!defined $file_entry);

        # upgrade single entry into array
        my $file_array = (ref $file_entry eq "HASH") ? [ $file_entry ] : $file_entry;

        foreach my $file_hash (@$file_array) {
            # push file_grp USE attribute down into each file entry
            # (to make lookups easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            $file_sec_ids->{$file_id} = $file_hash;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = $json_data->{'METS:mets'}->{'METS:structMap'};
    my $toplevel_div = $struct_map->{'METS:div'};

    my $pi_filename = $ofilename;
    if ($pi_filename !~ s/_structure\.json$/_item.xml/) {
        # Without the expected suffix the XML would be written over the JSON
        # cache file itself; pick a distinct name instead.
        $pi_filename .= "_item.xml";
    }

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    $pdCount++;

    return;
}
445
# Read one catalogue JSON file and download the first item in it whose
# rightsCode is "pd" (public domain).
#
# Parameters:
#   $filename - path of the JSON file; the structure cache file passed to
#               download_ht_doc() is this path with its trailing ".json"
#               replaced by "_structure.json"
#
# Files that cannot be opened, or that carry no usable 'items' entry, are
# reported on STDERR and skipped.  Invalid JSON still makes decode_json die.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    my $json_fh;
    if (!open($json_fh, '<', $filename)) {
        print STDERR "Error: Failed to open $filename for input\n";
        print STDERR "       $!\n";
        return;
    }
    binmode($json_fh,":utf8");

    my $json_file_content = "";
    while (defined (my $line = <$json_fh>)) {
        $json_file_content .= $line;
    }
    close($json_fh);

    my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);
    my $json_data = decode_json $json_file_content_utf8;

    # NOTE(review): hash key order is unspecified, so with several records
    # the "primary" key is an arbitrary one -- confirm whether that matters.
    my $record_hash = $json_data->{'records'};
    my @record_keys = (ref $record_hash eq "HASH") ? keys %$record_hash : ();
    my $primary_cat_key = shift @record_keys;

    my $items_entry = $json_data->{'items'};

    print STDERR "*** ref: ", ref $items_entry, "\n\n";

    my $items_array;
    if (ref $items_entry eq "HASH") {
        # upgrade single entry to array
        $items_array = [ $items_entry ];
    }
    elsif (ref $items_entry eq "ARRAY") {
        $items_array = $items_entry;
    }
    else {
        # Not a catalogue record (for example a structure file written by an
        # earlier run of this script): nothing to download.
        print STDERR "Skipping $filename: no 'items' entry found\n";
        return;
    }

    foreach my $item (@$items_array) {
        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        next unless (defined($rights_code) && ($rights_code eq "pd"));

        # in the public domain
        my $ofilename = $filename;
        # Anchored so only the file extension is rewritten, never a ".json"
        # occurring earlier in the path.
        $ofilename =~ s/\.json$/_structure.json/;

        download_ht_doc($primary_cat_key,$htid,$ofilename);

        # bail out at first public domain version of document
        last;
    }

    return;
}
519
520
# Recursively scan a directory and pass every catalogue ".json" file found
# to read_json_file().
#
# Parameters:
#   $full_dir - directory to scan
#
# Entries whose name starts with "." are ignored.  Files ending in
# "_structure.json" are ignored too: those are structure caches written by
# this script, not catalogue records, and would otherwise be picked up as
# input on every re-run.  A directory that cannot be opened is reported on
# STDERR and skipped.
sub process_dir
{
    my ($full_dir) = @_;

    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR "       $!\n";
        return;
    }

    # Read the whole listing before descending, so the handle is closed
    # before any recursion takes place.
    my @dir_content = grep { $_ !~ m/^\./ } sort readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
            next;
        }

        # file
        next if ($full_df !~ m/\.json$/);
        next if ($full_df =~ m/_structure\.json$/);

        read_json_file($full_df);
    }

    return;
}
554
555
# Script entry point.
#
# Parameters:
#   $argv_ref - array ref of command-line arguments; its first element is
#               removed and used as the top-level directory, with "output"
#               substituted when it is missing or false
#
# Any single trailing "/" is dropped from the directory before it is handed
# to process_dir().
sub main
{
    my ($argv_ref) = @_;

    my $requested_dir = shift @$argv_ref;
    my $toplevel_dir = $requested_dir ? $requested_dir : "output";

    # remove any trailing /
    $toplevel_dir =~ s{/$}{};

    process_dir($toplevel_dir);
}
568
# Run the script against the command-line arguments.
main( \@ARGV );
Note: See TracBrowser for help on using the browser.