root/gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl @ 26436

Revision 26436, 11.5 KB (checked in by davidb, 7 years ago)

Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone

Line 
1#!/usr/bin/perl -w
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use warnings;
7
8use Encode;
9use JSON;
10
11# use LWP;
12
13use OAuth::Lite::Consumer;
14use OAuth::Lite::AuthMethod;
15
16use URI::Escape;
17
# Issue an OAuth-signed GET request against the HathiTrust Data API.
#
#   $mode        resource name placed in the URL path (callers in this file
#                pass "pageimage", "pageocr" and "structure")
#   $htid        volume identifier appended to the URL path
#   $opt_seq     optional sequence number; appended as a further path
#                segment when defined
#   $opt_params  optional hash ref of extra request parameters
#
# Returns the response object when the request succeeded.  On failure the
# URL, status line and response content are reported on STDERR and undef is
# returned, so callers must check the result before using it.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    # NOTE(review): these credentials are embedded in the source; consider
    # moving them to configuration that is not under version control.
    my $access_key = '7e6ee38bae';
    my $secret_key = 'e0429c0394385486249b4a230702';

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
    $request_url .= "/$opt_seq" if (defined $opt_seq);

    # Always hand the consumer a hash ref, even when no extra parameters
    # were supplied by the caller.
    my $params = (defined $opt_params) ? $opt_params : {};

    my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
                                               'consumer_secret' => $secret_key,
                                               'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request( 'method' => 'GET',
                                       'url' => $request_url,
                                       'params' => $params );

    return $response if ($response->is_success());

    # The whole failure report goes to STDERR so that the separator lines
    # stay together with the diagnostics they delimit and STDOUT is left
    # untouched.
    print STDERR "**** Failed to retrieval any content from URL:\n";
    print STDERR "         ", $consumer->oauth_request->uri, "\n";
    print STDERR "------\n";
    print STDERR "**** Status:  ", $response->status_line, "\n";
    print STDERR "------\n";
    print STDERR "**** Content: ", $response->content, "\n";
    print STDERR "------\n";

    # Explicit scalar undef: every caller assigns the result to a scalar.
    return undef;
}
51
52
# Download the page image for one sequence number of a volume and store it
# in $ofilename, unless that file already exists.
#
#   $htid       volume identifier passed on to the Data API
#   $seq_num    sequence number of the page within the volume
#   $ofilename  path of the file the image data is written to
#
# Nothing useful is returned.  Problems (failed request, file that cannot
# be written) are reported on STDERR and the page is skipped.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    # _data_api() has already reported the details when it returns undef.
    my $response = _data_api("pageimage",$htid, $seq_num );
    if (!defined $response) {
        print STDERR "Error: No PageImage data retrieved for $htid/$seq_num\n";
        return;
    }

    my $img_out;
    if (!open($img_out, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR "       $!\n";
        return;
    }

    binmode($img_out);
    my $write_ok = print $img_out $response->content();
    my $close_ok = close($img_out);

    if (!$write_ok || !$close_ok) {
        print STDERR "Error: Failed to write image data to $ofilename\n";
        print STDERR "       $!\n";

        # A partially written file would be mistaken for a finished
        # download by the existence check above on the next run.
        unlink $ofilename;
    }

    return;
}
78   
79
80
# Fetch the OCR text for one sequence number of a volume.
#
#   $htid       volume identifier passed on to the Data API
#   $seq_num    sequence number of the page within the volume
#   $ofilename  optional path used as an on-disk cache: when the file exists
#               its content is returned instead of issuing a request,
#               otherwise the downloaded text is saved there.  When it is
#               undefined the text is downloaded and nothing is written.
#
# Returns the text, or undef when it could not be obtained (failed request,
# unreadable or empty cache file).  Problems are reported on STDERR.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    my $content = undef;

    if ((defined $ofilename) && (-f $ofilename)) {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n    $ofilename\n";

        my $txt_in;
        if (open($txt_in, '<', $ofilename)) {
            binmode($txt_in,":utf8");

            # Line-wise append keeps $content undefined for an empty file.
            while (defined (my $line = <$txt_in>)) {
                $content .= $line;
            }
            close($txt_in);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR "       $!\n";
        }

        return $content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    # _data_api() has already reported the details when it returns undef.
    my $response = _data_api("pageocr",$htid, $seq_num );
    if (!defined $response) {
        print STDERR "Error: No PageOCR data retrieved for $htid/$seq_num\n";
        return $content;
    }

    $content = $response->content();

    # Only cache the text when the caller actually named a file.
    if (defined $ofilename) {
        my $txt_out;
        if (open($txt_out, '>', $ofilename)) {
            print $txt_out $content;
            if (!close($txt_out)) {
                print STDERR "Error: Failed to write OCR text to $ofilename\n";
                print STDERR "       $!\n";
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for binary output\n";
            print STDERR "       $!\n";
        }
    }

    return $content;
}
124   
# Obtain the structure record of a volume as a decoded JSON data structure.
#
#   $htid       volume identifier passed on to the Data API
#   $ofilename  path of the on-disk cache: when the file exists it is read
#               instead of issuing a request, otherwise the downloaded JSON
#               text is saved there
#
# Returns whatever decode_json produces for the record.  Dies with a
# descriptive message when no JSON text could be obtained (failed request or
# unreadable cache file); decode_json itself dies on malformed JSON.
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        # _data_api() has already reported the details when it returns undef.
        my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
        if (!defined $response) {
            die "Error: Failed to download METS structure record for $htid\n";
        }

        $json_content = $response->content();

        # NOTE(review): the response content is presumably a byte string;
        # writing it through a :utf8 layer and encoding it again below may
        # double-encode non-ASCII characters.  Left unchanged so existing
        # cache files keep being read the same way -- confirm.
        my $js_out;
        if (open($js_out, '>', $ofilename)) {
            binmode($js_out,":utf8");
            print $js_out $json_content;
            if (!close($js_out)) {
                print STDERR "Error: Failed to write JSON structure to $ofilename\n";
                print STDERR "       $!\n";
            }
        }
        else {
            # Not fatal: the record is still available in memory.
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR "       $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n    $ofilename\n";

        my $js_in;
        if (!open($js_in, '<', $ofilename)) {
            # Without this the empty string would reach decode_json and
            # fail there with a message unrelated to the real cause.
            die "Error: Failed to open cached JSON file $ofilename for input\n"
              . "       $!\n";
        }
        binmode($js_in,":utf8");

        while (defined (my $line = <$js_in>)) {
            $json_content .= $line;
        }
        close($js_in);
    }

    my $json_content_utf8 = Encode::encode("utf8",$json_content);
    my $json_data = decode_json($json_content_utf8);

    return $json_data;
}
176
177
178# Example file
179
180#<PagedDocument>
181#  <Metadata name="Title">Matariki 1881</Metadata>
182#  <Metadata name="Date">18810423</Metadata>
183#  <Metadata name="Number">1</Metadata>
184#  <PageGroup>
185#    <Metadata name="Title">Supplementary Material</Metadata>
186#    <Page txtfile="abstracts/23__1abstract.txt">
187#      <Metadata name="Title">Abstract</Metadata>
188#    </Page>
189#  </PageGroup>
190#  <PageGroup>
191#    <Metadata name="Title">Newspaper pages</Metadata>
192#    <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
193#    <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
194#    <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
195#  </PageGroup>
196#</PagedDocument>
197
# Replace the characters that must not appear literally inside a
# double-quoted XML attribute value.
sub _xml_attr_escape
{
    my ($text) = @_;

    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}

# Write one element of the item file for $this_div to the PIOUT filehandle
# (which the caller must have opened), then recurse into its child divs.
#
#   $this_div             hash ref for the current 'METS:div' entry
#   $pagenum              value written as the pagenum attribute of a leaf
#   $elem_name            name of the element wrapped around this div
#   $depth                nesting depth, used for two-space indentation
#   $htid                 volume identifier passed on to the download subs
#   $file_id_map          hash ref mapping file IDs to file entries; each
#                         entry is read for 'USE', 'SEQ' and
#                         'METS:FLocat'->'xlink:href'
#   $resource_output_dir  directory the page images and text are saved in
#
# A div carrying 'METS:fptr' entries is treated as a page: its image and OCR
# files are downloaded and a <Page .../> element is emitted.  Child divs are
# always emitted as nested <PageGroup> elements.
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$elem_name,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    # File references written into the item file use only the last path
    # component of the resource directory.  A directory given without any
    # slash is already that component.
    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);
    $local_output_dir = $resource_output_dir if (!defined $local_output_dir);

    print PIOUT "  " x $depth, "<$elem_name>\n";

    my $fptr_entry = $this_div->{'METS:fptr'};

    if (defined $fptr_entry) {
        # hit a leaf node

        # upgrade single entry to array
        my $fptr_array = (ref $fptr_entry eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};
            my $file = (defined $fileid) ? $file_id_map->{$fileid} : undef;

            if (!defined $file) {
                my $shown_id = (defined $fileid) ? $fileid : "(undefined)";
                print STDERR "Warning: No file entry found for FILEID $shown_id\n";
                next;
            }

            my $seq  = $file->{'SEQ'};
            my $use  = $file->{'USE'};
            my $href = $file->{'METS:FLocat'}->{'xlink:href'};

            # Without a usage type or a file name there is nothing to fetch.
            next if (!defined $use || !defined $href);

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                pageimage_data_api($htid,$seq,"$resource_output_dir/$href");
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                pageocr_data_api($htid,$seq,"$resource_output_dir/$href");
            }
        }

        # Generate line along the following lines
        #  <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT "  " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"", _xml_attr_escape($pagenum), "\" " if defined $pagenum;
        print PIOUT "imgfile=\"", _xml_attr_escape($imgfile), "\" " if defined $imgfile;
        print PIOUT "txtfile=\"", _xml_attr_escape($txtfile), "\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs

    my $div_entry = $this_div->{'METS:div'};

    if (defined $div_entry) {
        # upgrade single entry to array
        my $div_array = (ref $div_entry eq "HASH") ? [ $div_entry ] : $div_entry;

        foreach my $div_hash (@$div_array) {
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash,$child_pagenum,"PageGroup",$depth+1,$htid,$file_id_map,$resource_output_dir);
        }
    }

    print PIOUT "  " x $depth, "</$elem_name>\n";
}
282
# Write the item file describing a volume and download its page resources.
#
#   $toplevel_div  top-level 'METS:div' entry of the structure record
#   $htid          volume identifier passed on to the download subs
#   $file_id_map   hash ref mapping file IDs to file entries
#   $ofilename     path of the item file to write; the same path without
#                  its extension is used as the directory for the page
#                  images and text, and is created when missing
#
# The item file is written through the PIOUT filehandle, which
# rec_paged_image_structure() prints to.  Problems are reported on STDERR.
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    # Strip the extension of the last path component only; a dot in a
    # directory name must not be mistaken for the start of the extension.
    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\.[^.\/]+$/);
    if (!defined $resource_output_dir || $resource_output_dir eq "") {
        print STDERR "Error: Cannot derive a resource directory from $ofilename\n";
        return;
    }

    if (!-d $resource_output_dir) {
        if (!mkdir $resource_output_dir) {
            print STDERR "Error: Failed to create directory $resource_output_dir\n";
            print STDERR "       $!\n";
            return;
        }
    }

    if (open(PIOUT, '>', $ofilename)) {
        binmode(PIOUT,":utf8");

        # The root element is <PagedDocument>, as in the example item file
        # shown in the comment block earlier in this script.
        rec_paged_image_structure($toplevel_div,1,"PagedDocument",0,$htid,$file_id_map,$resource_output_dir);

        if (!close(PIOUT)) {
            print STDERR "Error: Failed to write $ofilename\n";
            print STDERR "       $!\n";
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR "       $!\n";
    }

    return;
}
309
310
# Normalise a decoded JSON entry that may be absent, a single hash or an
# array of hashes into an array ref.
sub _as_array_ref
{
    my ($entry) = @_;

    return [] if (!defined $entry);
    return [ $entry ] if (ref $entry eq "HASH");
    return $entry;
}

# Download one volume: fetch its structure record, index the file entries
# by ID, then generate the item file together with the page resources.
#
#   $cat_key    catalogue record key of the volume (currently not used)
#   $htid       volume identifier passed on to the Data API
#   $ofilename  path of the cached JSON structure file; the item file name
#               is derived from it by replacing "_structure.json" with
#               "_item.xml"
#
# Does not return: the program is ended with exit status 0 once the volume
# has been handled.
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);

    # Map in the IDs from:
    # METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    # A lone fileGrp arrives as a hash rather than an array, just like the
    # METS:file entries below, so both levels get the same treatment.
    my $file_grp_array
        = _as_array_ref($json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'});

    foreach my $file_grp (@$file_grp_array) {
        my $use = $file_grp->{'USE'};

        foreach my $file_hash (@{ _as_array_ref($file_grp->{'METS:file'}) }) {
            # push file_grp USE attribute down into each file entry, so the
            # usage type can be read straight from the entry later on
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            next if (!defined $file_id);

            $file_sec_ids->{$file_id} = $file_hash;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map_array = $json_data->{'METS:mets'}->{'METS:structMap'};
    my $toplevel_div = $struct_map_array->{'METS:div'};

    my $pi_filename = $ofilename;
    if ($pi_filename !~ s/_structure\.json$/_item.xml/) {
        # Never let the item file take the name of the JSON cache file.
        $pi_filename .= "_item.xml";
    }

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    # NOTE(review): this ends the whole program after the first volume has
    # been handled, which looks like an initial-cut limit -- confirm before
    # removing it.
    exit 0;
}
372
# Read one catalogue JSON file and download the first item in it whose
# rights code is "pd" (public domain).
#
#   $filename  path of the JSON file; it is expected to hold a hash with a
#              'records' hash and an 'items' array whose entries carry
#              'htid' and 'rightsCode'
#
# Files that cannot be opened, cannot be parsed, or lack an 'items' array
# are reported on STDERR and skipped.  Nothing useful is returned.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    my $json_in;
    if (!open($json_in, '<', $filename)) {
        print STDERR "Error: Failed to open $filename for input\n";
        print STDERR "       $!\n";
        return;
    }
    binmode($json_in,":utf8");

    my $json_file_content = "";
    while (defined (my $line = <$json_in>)) {
        $json_file_content .= $line;
    }
    close($json_in);

    my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);

    # decode_json dies on malformed input; one bad file should not abort
    # the whole directory traversal.
    my $json_data = eval { decode_json($json_file_content_utf8) };
    if (!defined $json_data || ref $json_data ne "HASH") {
        my $reason = $@ ? $@ : "top-level value is not a JSON object\n";
        print STDERR "Error: Failed to parse $filename as JSON\n";
        print STDERR "       $reason";
        return;
    }

    # Sorted so the chosen key does not depend on hash ordering.
    my $record_hash = $json_data->{'records'};
    my @record_keys = (ref $record_hash eq "HASH") ? sort keys %$record_hash : ();
    my $primary_cat_key = shift @record_keys;

    # Files without an item list (for instance the *_structure.json files
    # this script writes next to its input) are not catalogue records.
    my $items_array = $json_data->{'items'};
    if (ref $items_array ne "ARRAY") {
        print STDERR "Skipping $filename: no 'items' array found\n";
        return;
    }

    foreach my $item (@$items_array) {
        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        next if (!defined($rights_code) || ($rights_code ne "pd"));

        # in the public domain

        # Only the trailing extension is replaced, never a ".json" that
        # happens to occur earlier in the path.
        my $ofilename = $filename;
        $ofilename =~ s/\.json$/_structure.json/;

        download_ht_doc($primary_cat_key,$htid,$ofilename);

        # bail out at first public domain version of document
        last;
    }

    return;
}
434
435
# Recursively walk a directory and hand every catalogue JSON file found to
# read_json_file().
#
#   $full_dir  path of the directory to process
#
# Entries whose name starts with a dot are ignored.  A directory that
# cannot be opened is reported on STDERR and skipped.  Nothing useful is
# returned.
sub process_dir
{
    my ($full_dir) = @_;

    # Lexical handle: this sub calls itself, so each level of the recursion
    # gets a handle of its own.
    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR "       $!\n";
        return;
    }

    my @dir_content = grep { $_ !~ m/^\./ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
            next;
        }

        # file
        next if ($full_df !~ m/\.json$/);

        # The structure records this script caches beside its input also
        # end in .json; they are output of an earlier run, not catalogue
        # records to be processed.
        next if ($full_df =~ m/_structure\.json$/);

        read_json_file($full_df);
    }

    return;
}
469
470
# Program entry point.
#
#   $argv_ref  reference to the argument list; its first element, when
#              present and true, names the directory to process, otherwise
#              "output" is used.  The element is removed from the list.
sub main
{
    my ($argv_ref) = @_;

    my $start_dir = shift @$argv_ref;
    if (!$start_dir) {
        $start_dir = "output";
    }

    # remove any trailing /
    $start_dir =~ s/\/$//;

    process_dir($start_dir);
}

main(\@ARGV);
Note: See TracBrowser for help on using the browser.