source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 8 years ago

Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone

File size: 11.5 KB
Line 
1#!/usr/bin/perl -w
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use warnings;
7
8use Encode;
9use JSON;
10
11# use LWP;
12
13use OAuth::Lite::Consumer;
14use OAuth::Lite::AuthMethod;
15
16use URI::Escape;
17
# Issue a signed GET request against the HathiTrust Data API.
#
# Parameters:
#   $mode       - resource name placed in the URL path (callers in this file
#                 pass "pageimage", "pageocr" and "structure")
#   $htid       - volume identifier, appended to the URL path as given
#   $opt_seq    - optional page sequence number; appended to the path when defined
#   $opt_params - optional hash ref of extra query parameters
#                 (e.g. {'alt' => "json"}); passed through unchanged
#
# Returns the response object on success, or undef when the request was not
# successful (after the signed URL, status line and body have been logged).
#
# The OAuth credentials default to the values that have always been embedded
# in this script, but may be overridden with the HATHITRUST_ACCESS_KEY and
# HATHITRUST_SECRET_KEY environment variables so keys need not be edited into
# the source.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    my $access_key = defined $ENV{'HATHITRUST_ACCESS_KEY'}
        ? $ENV{'HATHITRUST_ACCESS_KEY'}
        : '7e6ee38bae';
    my $secret_key = defined $ENV{'HATHITRUST_SECRET_KEY'}
        ? $ENV{'HATHITRUST_SECRET_KEY'}
        : 'e0429c0394385486249b4a230702';

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";

    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
                                               'consumer_secret' => $secret_key,
                                               'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request( 'method' => 'GET',
                                       'url' => $request_url,
                                       'params' => $opt_params );

    if (!$response->is_success()) {
        # Keep the whole failure report on STDERR: previously the separator
        # lines went to STDOUT, so a redirected log lost its structure and
        # STDOUT was polluted with stray "------" lines.
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        print STDERR "------\n";
        print STDERR "**** Status: ", $response->status_line, "\n";
        print STDERR "------\n";
        print STDERR "**** Content: ", $response->content, "\n";
        print STDERR "------\n";

        # Callers test for undef to detect a failed request
        $response = undef;
    }

    return $response;
}
51
52
# Download the page image for one page of a volume, unless it is already
# on disk.
#
# Parameters:
#   $htid      - volume identifier passed on to _data_api()
#   $seq_num   - page sequence number passed on to _data_api()
#   $ofilename - file the image bytes are written to; if it already exists
#                no request is made
#
# Returns nothing useful.  Problems are reported on STDERR and the page is
# skipped, so one bad page does not abort the rest of the run.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    # _data_api() returns undef (having logged the details) when the request
    # fails; calling ->content() on that used to kill the whole script.
    my $response = _data_api("pageimage",$htid, $seq_num );
    if (!defined $response) {
        print STDERR "Error: No PageImage data retrieved for $htid/$seq_num\n";
        return;
    }

    my $content = $response->content();

    # Three-argument open: the filename can never be read as an open mode
    my $img_out;
    if (!open($img_out, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR " $!\n";
        return;
    }

    binmode($img_out);
    my $printed_ok = print {$img_out} $content;
    my $closed_ok = close($img_out);

    if (!$printed_ok || !$closed_ok) {
        print STDERR "Error: Failed to write PageImage data to $ofilename\n";
        print STDERR " $!\n";

        # A truncated file would be mistaken for a finished download by the
        # -f test above on the next run, so remove it.
        unlink($ofilename);
    }

    return;
}
78
79
80
# Fetch the OCR text for one page of a volume, using a file on disk as a
# cache when one is named.
#
# Parameters:
#   $htid      - volume identifier passed on to _data_api()
#   $seq_num   - page sequence number passed on to _data_api()
#   $ofilename - optional cache file.  If it exists its content is returned
#                without any request; if it is defined but missing, the
#                downloaded text is saved there; if it is undefined the text
#                is downloaded and returned without being saved.
#
# Returns the page text, or undef when it could not be obtained (an empty
# cache file also yields undef).
#
# NOTE(review): a fresh download is returned exactly as ->content() supplies
# it, whereas the cached branch reads through a :utf8 layer -- the two
# branches may therefore differ in how non-ASCII text is represented.
# Callers in this file ignore the return value; confirm before relying on it.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    my $content = undef;

    my $have_cached_copy = (defined $ofilename) && (-f $ofilename);

    if (!$have_cached_copy) {
        print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

        # _data_api() returns undef (having logged the details) when the
        # request fails; calling ->content() on that used to kill the script.
        my $response = _data_api("pageocr",$htid, $seq_num );
        if (!defined $response) {
            print STDERR "Error: No PageOCR data retrieved for $htid/$seq_num\n";
            return $content;
        }

        $content = $response->content();

        # Only write a cache file when the caller actually named one; the
        # undefined case used to fall into open() with an undefined filename.
        if (defined $ofilename) {
            my $txt_out;
            if (open($txt_out, '>', $ofilename)) {
                my $printed_ok = print {$txt_out} $content;
                my $closed_ok = close($txt_out);

                if (!$printed_ok || !$closed_ok) {
                    print STDERR "Error: Failed to write PageOCR data to $ofilename\n";
                    print STDERR " $!\n";

                    # do not leave a truncated file to be trusted as a cache
                    unlink($ofilename);
                }
            }
            else {
                print STDERR "Error: Failed to open $ofilename for binary output\n";
                print STDERR " $!\n";
            }
        }
    }
    else {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n $ofilename\n";

        my $txt_in;
        if (open($txt_in, '<', $ofilename)) {
            binmode($txt_in,":utf8");

            # Accumulate line by line so an empty file still yields undef
            while (defined (my $line = <$txt_in>)) {
                $content .= $line;
            }
            close($txt_in);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR " $!\n";
        }
    }

    return $content;
}
124
# Obtain the structure record for a volume as a decoded JSON data structure,
# using $ofilename as an on-disk cache.
#
# Parameters:
#   $htid      - volume identifier passed on to _data_api()
#   $ofilename - cache file; read if it exists, otherwise the record is
#                requested (with 'alt' => "json") and saved there
#
# Returns whatever decode_json() produces for the record.
#
# Dies if no JSON text could be obtained (failed request, unreadable cache
# file) or if the text does not parse.  The script always died in these
# situations; it now does so with a message that names the volume.
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
        if (!defined $response) {
            die "Error: Failed to retrieve METS structure record for $htid\n";
        }

        # Turn the response body into a character string, which is what the
        # cached branch below yields and what the :utf8 output layer and the
        # encode() call further down expect.  Feeding them the undecoded body
        # encoded any non-ASCII text a second time.
        # (assumes ->content() yields UTF-8 encoded octets -- TODO confirm)
        $json_content = Encode::decode("utf8", $response->content());

        my $js_out;
        if (open($js_out, '>', $ofilename)) {
            binmode($js_out,":utf8");
            my $printed_ok = print {$js_out} $json_content;
            my $closed_ok = close($js_out);

            if (!$printed_ok || !$closed_ok) {
                print STDERR "Error: Failed to write JSON structure to $ofilename\n";
                print STDERR " $!\n";

                # do not leave a truncated file to be trusted as a cache
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";

        my $js_in;
        if (open($js_in, '<', $ofilename)) {
            binmode($js_in,":utf8");

            while (defined (my $line = <$js_in>)) {
                $json_content .= $line;
            }
            close($js_in);
        }
        else {
            print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
            print STDERR " $!\n";
        }
    }

    if ($json_content eq "") {
        die "Error: No JSON structure content available for $htid ($ofilename)\n";
    }

    # decode_json() wants UTF-8 octets, so encode the character string first
    my $json_content_utf8 = Encode::encode("utf8",$json_content);
    my $json_data = decode_json($json_content_utf8);

    return $json_data;
}
176
177
# Example of the target item file format:

#<PagedDocument>
#  <Metadata name="Title">Matariki 1881</Metadata>
#  <Metadata name="Date">18810423</Metadata>
#  <Metadata name="Number">1</Metadata>
#  <PageGroup>
#    <Metadata name="Title">Supplementary Material</Metadata>
#    <Page txtfile="abstracts/23__1abstract.txt">
#      <Metadata name="Title">Abstract</Metadata>
#    </Page>
#  </PageGroup>
#  <PageGroup>
#    <Metadata name="Title">Newspaper pages</Metadata>
#    <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
#    <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
#    <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
#  </PageGroup>
#</PagedDocument>
197
# Recursively write one structure div, and every div nested inside it, to
# the already-open PIOUT filehandle, downloading the page image and OCR text
# of each leaf along the way.
#
# Parameters:
#   $this_div            - hash ref for the div; its 'METS:fptr' and
#                          'METS:div' entries may each be a single hash or an
#                          array of hashes
#   $pagenum             - page number written on a leaf's <Page> element
#                          (omitted when undefined)
#   $elem_name           - name of the element wrapped around this div
#   $depth               - nesting depth, used for indentation
#   $htid                - volume identifier handed to the download helpers
#   $file_id_map         - hash ref from file ID to file entry; each entry
#                          supplies 'USE', 'SEQ' and
#                          'METS:FLocat'->'xlink:href'
#   $resource_output_dir - directory the downloaded files are saved in; its
#                          last path component prefixes the imgfile/txtfile
#                          attribute values
#
# Output goes to the global PIOUT handle, which the caller must have opened.
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$elem_name,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);

    # With no "/" in the path the pattern cannot match; the directory is then
    # already relative, so use it as it stands rather than leaving undef.
    $local_output_dir = $resource_output_dir if (!defined $local_output_dir);

    print PIOUT " " x $depth, "<$elem_name>\n";

    my $fptr_entry = $this_div->{'METS:fptr'};

    if (defined $fptr_entry) {
        # hit a leaf node

        # upgrade single entry to array
        my $fptr_array = (ref $fptr_entry eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};

            # Look the file up without autovivifying an empty entry: an
            # unknown ID used to end in uninitialised-value warnings instead
            # of a message saying what was wrong.
            my $file = (defined $fileid) ? $file_id_map->{$fileid} : undef;
            if (ref $file ne "HASH") {
                my $shown_id = (defined $fileid) ? $fileid : "(undefined)";
                print STDERR "Warning: No file entry for FILEID $shown_id in $htid\n";
                next;
            }

            my $seq = $file->{'SEQ'};
            my $flocat = $file->{'METS:FLocat'};
            my $href = (ref $flocat eq "HASH") ? $flocat->{'xlink:href'} : undef;
            my $use = (defined $file->{'USE'}) ? $file->{'USE'} : "";

            if (!defined $href) {
                print STDERR "Warning: No file location for FILEID $fileid in $htid\n";
                next;
            }

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                my $full_imgfile = "$resource_output_dir/$href";
                pageimage_data_api($htid,$seq,$full_imgfile);
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                my $full_txtfile = "$resource_output_dir/$href";
                pageocr_data_api($htid,$seq,$full_txtfile);
            }
        }

        # Generate a line along the following lines:
        # <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT " " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"$pagenum\" " if defined $pagenum;
        print PIOUT "imgfile=\"$imgfile\" " if defined $imgfile;
        print PIOUT "txtfile=\"$txtfile\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs

    my $div_entry = $this_div->{'METS:div'};

    if (defined $div_entry) {
        # upgrade single entry to array
        my $div_array = (ref $div_entry eq "HASH") ? [ $div_entry ] : $div_entry;

        foreach my $div_hash (@$div_array) {
            # named differently from the $pagenum parameter it used to shadow
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash,$child_pagenum,"PageGroup",$depth+1,$htid,$file_id_map,$resource_output_dir);
        }
    }

    print PIOUT " " x $depth, "</$elem_name>\n";
}
282
# Write the XML item file for one volume and download its page resources.
#
# Parameters:
#   $toplevel_div - hash ref for the outermost structure div
#   $htid         - volume identifier handed down to the download helpers
#   $file_id_map  - hash ref from file ID to file entry
#   $ofilename    - item file to write; the same path with its extension
#                   removed names the directory the page files are saved in
#                   (created if missing)
#
# Returns nothing useful; failures are reported on STDERR.
#
# The item file is written through the global PIOUT handle because that is
# the handle rec_paged_image_structure() prints to.
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);

    if (!defined $resource_output_dir) {
        # No extension to strip, so there is no distinct directory name to
        # use; carrying on with undef only produced warnings and bad paths.
        print STDERR "Error: Cannot derive a resource directory name from $ofilename\n";
        return;
    }

    if (!-d $resource_output_dir) {
        if (!mkdir($resource_output_dir)) {
            print STDERR "Error: Failed to create directory $resource_output_dir\n";
            print STDERR " $!\n";
            return;
        }
    }

    # Three-argument open: the filename can never be read as an open mode
    if (open(PIOUT, '>', $ofilename)) {
        binmode(PIOUT,":utf8");

        # The root element is "PagedDocument", as in the example item file
        # documented in this script; it was previously misspelt
        # "PageDocument".
        rec_paged_image_structure($toplevel_div,1,"PagedDocument",0,$htid,$file_id_map,$resource_output_dir);

        # buffered write errors only surface when the handle is closed
        if (!close(PIOUT)) {
            print STDERR "Error: Failed to finish writing $ofilename\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR " $!\n";
    }

    return;
}
309
310
# Download one volume: fetch (or load from cache) its structure record,
# index the file entries by ID, then generate the item file and page
# resources.
#
# Parameters:
#   $cat_key   - catalogue key of the record the volume belongs to
#                (accepted for the caller's benefit; not used here)
#   $htid      - volume identifier
#   $ofilename - file the JSON structure record is cached in; the item file
#                name is derived from it by replacing a trailing
#                "_structure.json" with "_item.xml"
#
# Returns nothing useful.
#
# This sub used to finish with "exit 0", which ended the whole program after
# the first volume and left the caller's loop control and the rest of the
# directory walk unreachable.  It now returns so the remaining records are
# processed.
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);

    # Map in the IDs from:
    #   METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_entry = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # A lone file group arrives as a hash rather than an array of hashes --
    # the same single-entry form already handled for METS:file below.
    my $file_grp_array = (ref $file_grp_entry eq "HASH") ? [ $file_grp_entry ] : $file_grp_entry;
    $file_grp_array = [] if (!defined $file_grp_array);

    foreach my $file_grp (@$file_grp_array) {

        my $use = $file_grp->{'USE'};

        my $file_entry = $file_grp->{'METS:file'};

        # upgrade single entry into array
        my $file_array = (ref $file_entry eq "HASH") ? [ $file_entry ] : $file_entry;
        $file_array = [] if (!defined $file_array);

        foreach my $file_hash (@$file_array) {
            # push file_grp USE attribute down into each file entry
            # (to make life easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            $file_sec_ids->{$file_id} = $file_hash;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = $json_data->{'METS:mets'}->{'METS:structMap'};
    my $toplevel_div = $struct_map->{'METS:div'};

    my $pi_filename = $ofilename;
    $pi_filename =~ s/_structure\.json$/_item.xml/;

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    return;
}
372
# Read one catalogue record file and download the first public-domain item
# it lists.
#
# Parameters:
#   $filename - JSON file whose 'records' hash supplies the catalogue key
#               and whose 'items' array lists volumes, each with an 'htid'
#               and an optional 'rightsCode'
#
# The first item whose rightsCode is "pd" is passed to download_ht_doc(),
# with a structure-file name derived from $filename by replacing the
# trailing ".json" with "_structure.json".  Later items are not considered.
#
# Returns nothing useful.  A file that cannot be opened, does not parse, or
# has no 'items' list is reported on STDERR and skipped.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    # Three-argument open, and checked: a failed open used to go unnoticed
    # until decode_json() died on the empty string.
    my $json_fh;
    if (!open($json_fh, '<', $filename)) {
        print STDERR "Error: Failed to open JSON file $filename for input\n";
        print STDERR " $!\n";
        return;
    }
    binmode($json_fh,":utf8");

    my $json_file_content = "";
    while (defined (my $line = <$json_fh>)) {
        $json_file_content .= $line;
    }
    close($json_fh);

    # decode_json() wants UTF-8 octets, so encode the character string first
    my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);

    my $json_data = eval { decode_json($json_file_content_utf8) };
    if (ref $json_data ne "HASH") {
        print STDERR "Error: Failed to parse a JSON record from $filename\n";
        print STDERR " $@\n" if $@;
        return;
    }

    # Any one of the record keys serves as the catalogue key
    my $record_hash = $json_data->{'records'};
    my @record_keys = (ref $record_hash eq "HASH") ? keys %$record_hash : ();
    my $primary_cat_key = shift @record_keys;

    # A JSON file without an items list is not a catalogue record;
    # dereferencing the missing list used to be fatal.
    my $items_array = $json_data->{'items'};
    if (ref $items_array ne "ARRAY") {
        print STDERR "=> No 'items' list found in $filename, skipping\n";
        return;
    }

    foreach my $item (@$items_array) {

        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        if (defined($rights_code) && ($rights_code eq "pd")) {
            # in the public domain

            # Anchor the substitution to the end of the name so a ".json"
            # earlier in the path is never rewritten; if the name has no such
            # suffix, append one rather than reuse the input file itself.
            my $ofilename = $filename;
            if ($ofilename !~ s/\.json$/_structure.json/) {
                $ofilename .= "_structure.json";
            }

            download_ht_doc($primary_cat_key,$htid,$ofilename);

            # bail out at first public domain version of document
            last;
        }
    }

    return;
}
434
435
# Walk a directory tree and hand every catalogue record file in it to
# read_json_file().
#
# Parameters:
#   $full_dir - directory to scan; entries whose names start with "." are
#               ignored and sub-directories are processed recursively
#
# Files ending in "_structure.json" are skipped.  Those are the structure
# records this script itself caches next to each input file, and they also
# end in ".json", so on a later run they used to be fed back in as if they
# were catalogue records.
#
# Returns nothing useful; an unreadable directory is reported on STDERR.
sub process_dir
{
    my ($full_dir) = @_;

    # Lexical handle: each level of the recursion gets its own
    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR " $!\n";
        return;
    }

    my @dir_content = grep { $_ !~ m/^\./ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
        }
        elsif ($full_df =~ m/_structure\.json$/) {
            # cached structure record from an earlier run, not an input file
            next;
        }
        elsif ($full_df =~ m/\.json$/) {
            read_json_file($full_df);
        }
    }

    return;
}
469
470
# Script entry point.
#
# Parameters:
#   $argv_ref - array ref of command-line arguments; the first one is
#               shifted off and used as the directory to process
#
# The directory falls back to "output" when no (true) argument is supplied.
# One trailing "/" is removed before the tree is handed to process_dir().
sub main
{
    my ($argv_ref) = @_;

    my $requested_dir = shift @$argv_ref;
    my $toplevel_dir = $requested_dir ? $requested_dir : "output";

    $toplevel_dir =~ s{/$}{}; # remove any trailing /

    process_dir($toplevel_dir);
}

main(\@ARGV);
Note: See TracBrowser for help on using the repository browser.