source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 11 years ago

Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone

File size: 11.5 KB
Line 
1#!/usr/bin/perl -w
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use warnings;
7
8use Encode;
9use JSON;
10
11# use LWP;
12
13use OAuth::Lite::Consumer;
14use OAuth::Lite::AuthMethod;
15
16use URI::Escape;
17
# Issue a GET request against the HathiTrust Data API and hand back the
# response.
#
# The request is built with OAuth::Lite::Consumer using the URL_QUERY auth
# method and is sent to
#   http://babel.hathitrust.org/cgi/htd/<mode>/<htid>[/<seq>]
#
# Parameters:
#   $mode       - resource name placed in the URL path (callers in this file
#                 use "pageimage", "pageocr" and "structure")
#   $htid       - volume identifier, placed in the URL path unchanged
#   $opt_seq    - optional page sequence number, appended as a further path
#                 segment when defined
#   $opt_params - optional hash ref of extra request parameters
#                 (e.g. { 'alt' => "json" })
#
# Returns the response object on success.  On failure the request URL,
# status line and response body are reported on STDERR and undef is
# returned, so callers must check the result before using it.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    # NOTE(review): credentials are embedded in the source.  They can be
    # overridden through the environment so that real keys need not be
    # committed; the embedded values remain the defaults.
    my $env_access_key = $ENV{'HT_DATA_API_ACCESS_KEY'};
    my $env_secret_key = $ENV{'HT_DATA_API_SECRET_KEY'};

    my $access_key = ((defined $env_access_key) && ($env_access_key ne ""))
        ? $env_access_key
        : '7e6ee38bae';
    my $secret_key = ((defined $env_secret_key) && ($env_secret_key ne ""))
        ? $env_secret_key
        : 'e0429c0394385486249b4a230702';

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";

    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new( 'consumer_key'    => $access_key,
                                               'consumer_secret' => $secret_key,
                                               'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request( 'method' => 'GET',
                                       'url'    => $request_url,
                                       'params' => $opt_params );

    if (!$response->is_success()) {
        # Keep the whole diagnostic on STDERR; previously the separator
        # lines went to STDOUT and could end up interleaved or redirected
        # away from the messages they were meant to frame.
        print STDERR "**** Failed to retrieval any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        print STDERR "------\n";
        print STDERR "**** Status: ", $response->status_line, "\n";
        print STDERR "------\n";
        print STDERR "**** Content: ", $response->content, "\n";
        print STDERR "------\n";

        $response = undef;
    }

    return $response;
}
51
52
# Download one page image through the Data API and store it on disk.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - file the image data is written to
#
# If $ofilename already exists nothing is downloaded.  A failed request or
# a failed write is reported on STDERR and leaves no file behind, so that a
# later run will retry the download.  No meaningful value is returned.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    # _data_api() returns undef (after reporting the problem) when the
    # request fails; calling ->content() on that would abort the whole run.
    my $response = _data_api("pageimage",$htid, $seq_num );
    if (!defined $response) {
        print STDERR "Error: Failed to download PageImage $htid/$seq_num\n";
        return;
    }

    my $img_fh;
    if (!open($img_fh,">",$ofilename)) {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR " $!\n";
        return;
    }

    binmode($img_fh);

    my $wrote_ok  = print {$img_fh} $response->content();
    my $closed_ok = close($img_fh);

    if (!($wrote_ok && $closed_ok)) {
        print STDERR "Error: Failed to write image data to $ofilename\n";
        print STDERR " $!\n";

        # A truncated file would be mistaken for a finished download by the
        # "already exists" check above on the next run.
        unlink($ofilename);
    }

    return;
}
78
79
80
# Fetch the OCR text of one page, using an on-disk copy when there is one.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - optional cache file.  When defined and present its content
#                is returned without contacting the Data API; when defined
#                and absent the downloaded text is saved to it.  When
#                undefined the text is downloaded and nothing is saved.
#
# Returns the page text, or undef if it could not be obtained (the problem
# is reported on STDERR).
#
# NOTE(review): the cached path reads through a ":utf8" layer while the
# download path returns the response content unchanged, so the two paths
# may differ in whether the returned string is decoded -- confirm which
# form callers expect before relying on the return value.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    my $content = undef;

    if ((defined $ofilename) && (-f $ofilename)) {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n $ofilename\n";

        my $in_fh;
        if (open($in_fh,"<",$ofilename)) {
            binmode($in_fh,":utf8");

            my $line;
            while (defined ($line=<$in_fh>)) {
                $content .= $line;
            }
            close($in_fh);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR " $!\n";
        }

        return $content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    # _data_api() returns undef (after reporting the problem) when the
    # request fails; calling ->content() on that would abort the whole run.
    my $response = _data_api("pageocr",$htid, $seq_num );
    if (!defined $response) {
        print STDERR "Error: Failed to download PageOCR (text) $htid/$seq_num\n";
        return $content;
    }

    $content = $response->content();

    # Only cache when the caller supplied a file name; an undefined name is
    # an accepted way of asking for the text alone.
    if (defined $ofilename) {
        my $out_fh;
        if (open($out_fh,">",$ofilename)) {
            my $wrote_ok  = print {$out_fh} $content;
            my $closed_ok = close($out_fh);

            if (!($wrote_ok && $closed_ok)) {
                print STDERR "Error: Failed to write text to $ofilename\n";
                print STDERR " $!\n";

                # Do not leave a truncated file to be picked up as a valid
                # cached copy on the next run.
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for binary output\n";
            print STDERR " $!\n";
        }
    }

    return $content;
}
124
# Obtain the structure record of a volume as a decoded JSON data structure.
#
# Parameters:
#   $htid      - volume identifier
#   $ofilename - cache file for the JSON text.  If it exists it is read
#                instead of contacting the Data API; otherwise the record is
#                downloaded (with 'alt' => "json") and saved to it.
#
# Returns the data structure produced by decode_json.
#
# Dies with a descriptive message when the record cannot be downloaded, the
# cache file cannot be read, or there is no JSON text to decode.  A failure
# to *write* the cache file is only reported on STDERR, because the
# downloaded record is still usable.
#
# The JSON text is handled as raw bytes from download to decode_json, which
# expects UTF-8 encoded input.  Pushing the downloaded bytes through a
# ":utf8" layer, or re-encoding them before decoding, would encode any
# non-ASCII text twice.
# NOTE(review): assumes the response content is the undecoded UTF-8 body --
# confirm against the HTTP response class in use.
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_bytes = undef;

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
        if (!defined $response) {
            die "Error: Failed to download METS structure record for $htid\n";
        }

        $json_bytes = $response->content();

        my $out_fh;
        if (open($out_fh,">",$ofilename)) {
            binmode($out_fh);

            my $wrote_ok  = print {$out_fh} $json_bytes;
            my $closed_ok = close($out_fh);

            if (!($wrote_ok && $closed_ok)) {
                print STDERR "Error: Failed to write JSON structure to $ofilename\n";
                print STDERR " $!\n";

                # A truncated cache file would be trusted on the next run.
                unlink($ofilename);
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";

        my $in_fh;
        if (!open($in_fh,"<",$ofilename)) {
            die "Error: Failed to open cached JSON file $ofilename for input\n $!\n";
        }
        binmode($in_fh);

        $json_bytes = "";
        my $line;
        while (defined ($line=<$in_fh>)) {
            $json_bytes .= $line;
        }
        close($in_fh);
    }

    if ((!defined $json_bytes) || ($json_bytes eq "")) {
        die "Error: Empty JSON structure record for $htid ($ofilename)\n";
    }

    my $json_data = decode_json($json_bytes);

    return $json_data;
}
176
177
178# Example file
179
180#<PagedDocument>
181# <Metadata name="Title">Matariki 1881</Metadata>
182# <Metadata name="Date">18810423</Metadata>
183# <Metadata name="Number">1</Metadata>
184# <PageGroup>
185# <Metadata name="Title">Supplementary Material</Metadata>
186# <Page txtfile="abstracts/23__1abstract.txt">
187# <Metadata name="Title">Abstract</Metadata>
188# </Page>
189# </PageGroup>
190# <PageGroup>
191# <Metadata name="Title">Newspaper pages</Metadata>
192# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
193# <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
194# <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
195# </PageGroup>
196#</PagedDocument>
197
# Escape the characters that must not appear literally inside a
# double-quoted XML attribute value.
sub _xml_attr_escape
{
    my ($text) = @_;

    my %entity_for = ( '&' => 'amp', '<' => 'lt', '>' => 'gt', '"' => 'quot' );
    $text =~ s/([&<>"])/'&' . $entity_for{$1} . ';'/ge;

    return $text;
}

# Recursively write one structure div, and everything below it, to the
# already-open PIOUT filehandle, downloading the page files it refers to.
#
# Parameters:
#   $this_div            - hash for the current 'METS:div' entry
#   $pagenum             - value for the pagenum attribute of a <Page>
#                          written for this div (omitted when undefined)
#   $elem_name           - name of the element that wraps this div's output
#   $depth               - nesting depth, used for indentation
#   $htid                - volume identifier, passed to the download subs
#   $file_id_map         - hash ref mapping FILEID values to file entries
#                          (each carrying 'USE', 'SEQ' and 'METS:FLocat')
#   $resource_output_dir - directory the downloaded page files are saved in
#
# A div that has 'METS:fptr' entries produces a <Page> element; any
# 'METS:div' children are written as nested <PageGroup> elements, each
# using its own 'ORDER' value as the page number.
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$elem_name,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    # File references in the output use only the last path component of the
    # resource directory.  Fall back to the whole value when it contains no
    # "/" at all, rather than continuing with an undefined prefix.
    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);
    if (!defined $local_output_dir) {
        $local_output_dir = $resource_output_dir;
    }

    print PIOUT " " x $depth, "<$elem_name>\n";

    my $fptr_entry = $this_div->{'METS:fptr'};

    if (defined $fptr_entry) {
        # hit a leaf node

        # a single entry arrives as a hash rather than a one-element array
        my $fptr_array = (ref $fptr_entry eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};
            my $file   = (defined $fileid) ? $file_id_map->{$fileid} : undef;

            if (!defined $file) {
                my $shown_id = (defined $fileid) ? $fileid : "(undefined)";
                print STDERR "Warning: No file entry for FILEID $shown_id, skipping\n";
                next;
            }

            my $use    = $file->{'USE'};
            my $seq    = $file->{'SEQ'};
            my $flocat = $file->{'METS:FLocat'};
            my $href   = (ref $flocat eq "HASH") ? $flocat->{'xlink:href'} : undef;

            if ((!defined $use) || (!defined $href)) {
                # without these there is nothing to classify or to name the
                # output file after
                print STDERR "Warning: Incomplete file entry for FILEID $fileid, skipping\n";
                next;
            }

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                my $full_imgfile = "$resource_output_dir/$href";
                pageimage_data_api($htid,$seq,$full_imgfile);
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                my $full_txtfile = "$resource_output_dir/$href";
                pageocr_data_api($htid,$seq,$full_txtfile);
            }
        }

        # Generate a line along the following lines
        #   <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        # Values are escaped so that a stray quote or ampersand cannot
        # break the XML.
        print PIOUT " " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"", _xml_attr_escape($pagenum), "\" " if defined $pagenum;
        print PIOUT "imgfile=\"", _xml_attr_escape($imgfile), "\" " if defined $imgfile;
        print PIOUT "txtfile=\"", _xml_attr_escape($txtfile), "\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs

    my $div_entry = $this_div->{'METS:div'};

    if (defined $div_entry) {
        # upgrade single entry to array
        my $div_array = (ref $div_entry eq "HASH") ? [ $div_entry ] : $div_entry;

        foreach my $div_hash (@$div_array) {
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash,$child_pagenum,"PageGroup",$depth+1,$htid,$file_id_map,$resource_output_dir);
        }
    }

    print PIOUT " " x $depth, "</$elem_name>\n";
}
282
# Write the paged-document item file for one volume and download the page
# files it refers to.
#
# Parameters:
#   $toplevel_div - top-level 'METS:div' hash of the volume's structure map
#   $htid         - volume identifier
#   $file_id_map  - hash ref mapping FILEID values to file entries
#   $ofilename    - item file to write.  The page files go into a directory
#                   named after this file with its extension removed; the
#                   directory is created when missing.
#
# Problems (no extension to strip, directory cannot be created, file cannot
# be opened) are reported on STDERR and nothing further is attempted.
# No meaningful value is returned.
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);
    if (!defined $resource_output_dir) {
        print STDERR "Error: Unable to derive a resource directory from $ofilename\n";
        return;
    }

    if ((!-d $resource_output_dir) && (!mkdir($resource_output_dir))) {
        print STDERR "Error: Failed to create directory $resource_output_dir\n";
        print STDERR " $!\n";
        return;
    }

    # PIOUT stays a package filehandle because rec_paged_image_structure()
    # prints to it by name.
    if (open(PIOUT,">",$ofilename)) {
        binmode(PIOUT,":utf8");

        # The root element is <PagedDocument>, as in the example item file
        # documented in this script; it was previously emitted as
        # "PageDocument".
        rec_paged_image_structure($toplevel_div,1,"PagedDocument",0,$htid,$file_id_map,$resource_output_dir);

        if (!close(PIOUT)) {
            print STDERR "Error: Failed to finish writing $ofilename\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR " $!\n";
    }

    return;
}
309
310
# Download one volume: fetch its structure record, index the files listed
# in it, then generate the item file (which in turn downloads the pages).
#
# Parameters:
#   $cat_key   - catalogue record key of the volume (currently unused)
#   $htid      - volume identifier
#   $ofilename - file the JSON structure record is cached in; the item file
#                is written next to it, with a trailing "_structure.json"
#                replaced by "_item.xml"
#
# Returns to the caller when done, so that the remaining catalogue files
# are processed too.  (An unconditional "exit 0" used to end the whole
# program after the first volume.)  No meaningful value is returned.
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);

    # Map in the IDs from:
    #   METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_entry = $json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'};

    # A lone entry arrives as a hash rather than a one-element array; this
    # is the same normalisation already applied to METS:file below.
    my $file_grp_array = [];
    if (ref $file_grp_entry eq "HASH") {
        $file_grp_array = [ $file_grp_entry ];
    }
    elsif (ref $file_grp_entry eq "ARRAY") {
        $file_grp_array = $file_grp_entry;
    }

    foreach my $file_grp (@$file_grp_array) {

        my $use = $file_grp->{'USE'};

        my $file_entry = $file_grp->{'METS:file'};

        my $file_array = [];
        if (ref $file_entry eq "HASH") {
            # upgrade single entry into array
            $file_array = [ $file_entry ];
        }
        elsif (ref $file_entry eq "ARRAY") {
            $file_array = $file_entry;
        }

        foreach my $file_hash (@$file_array) {
            # push the file group's USE attribute down into each file entry
            # (to make things easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            next if (!defined $file_id); # cannot be looked up without an ID

            $file_sec_ids->{$file_id} = $file_hash;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = $json_data->{'METS:mets'}->{'METS:structMap'};
    if (ref $struct_map eq "ARRAY") {
        # NOTE(review): if several structure maps are present only the first
        # is used -- confirm that is the intended one.
        $struct_map = $struct_map->[0];
    }
    my $toplevel_div = (ref $struct_map eq "HASH") ? $struct_map->{'METS:div'} : undef;

    my $pi_filename = $ofilename;
    $pi_filename =~ s/_structure\.json$/_item.xml/;

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    return;
}
372
# Read one catalogue JSON file and download the first item in it that is
# in the public domain (rightsCode "pd").
#
# Parameters:
#   $filename - JSON file holding a 'records' hash and an 'items' array
#
# The structure record of the chosen item is cached next to $filename,
# with the trailing ".json" replaced by "_structure.json".
#
# A file that cannot be opened or parsed, or that has no 'items' array (for
# example a previously generated "_structure.json" cache), is reported on
# STDERR and skipped.  No meaningful value is returned.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Proccessing file: $filename\n";

    my $json_fh;
    if (!open($json_fh,"<",$filename)) {
        print STDERR "Error: Failed to open $filename for input\n";
        print STDERR " $!\n";
        return;
    }
    binmode($json_fh,":utf8");

    my $json_file_content = "";

    my $line;
    while (defined ($line=<$json_fh>)) {
        $json_file_content .= $line;
    }

    close($json_fh);

    # decode_json expects UTF-8 encoded bytes, so undo the decoding done by
    # the ":utf8" input layer.
    my $json_file_content_utf8 = Encode::encode("utf8",$json_file_content);

    # decode_json dies on malformed input; one bad file should not end the
    # whole run.
    my $json_data = eval { decode_json($json_file_content_utf8) };
    if (ref $json_data ne "HASH") {
        print STDERR "Error: Failed to parse $filename as a JSON record\n";
        print STDERR " $@\n" if ($@);
        return;
    }

    my $items_array = $json_data->{'items'};
    if (ref $items_array ne "ARRAY") {
        print STDERR "=> no 'items' array found, skipping\n";
        return;
    }

    my $record_hash = $json_data->{'records'};
    my @record_keys = (ref $record_hash eq "HASH") ? keys %$record_hash : ();
    my $primary_cat_key = shift @record_keys;

    my $num_items = scalar(@$items_array);

    my $num_pd = 0;

    foreach my $item (@$items_array) {

        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

#       print "htid = $htid\n";
#       print "Rights code = $rights_code\n" if defined $rights_code;

        if (defined($rights_code) && ($rights_code eq "pd")) {
            # in the public domain
            $num_pd++;

            # NOTE(review): a URI-escaped copy of $htid used to be computed
            # here but was never used; the identifier is passed on
            # unescaped.  Confirm whether the Data API URL needs escaping.

            # Replace only the trailing extension; an unanchored match
            # would rewrite the first ".json" anywhere in the path.  A name
            # without that extension gets the suffix appended, so the cache
            # file can never be the catalogue file itself.
            my $ofilename = $filename;
            if ($ofilename !~ s/\.json$/_structure.json/) {
                $ofilename .= "_structure.json";
            }

            download_ht_doc($primary_cat_key,$htid,$ofilename);

            # bail out at first public domain version of document
            last;
        }
    }

#   if ($num_pd==0) {
#       print "++ $num_items item(s)\n";
#   }
#   else {
#       print "++ $num_items item(s) *of* *which* $num_pd is/are in the public domain\n";
#   }

    return;
}
434
435
# Walk a directory tree and hand every catalogue JSON file found to
# read_json_file().
#
# Parameters:
#   $full_dir - directory to scan; sub-directories are scanned recursively
#
# Entries whose name starts with "." are ignored.  Files ending in
# "_structure.json" are ignored as well: those are the structure caches
# this script writes next to each catalogue file, not catalogue records.
# A directory that cannot be opened is reported on STDERR and skipped.
# No meaningful value is returned.
sub process_dir
{
    my ($full_dir) = @_;

#   print "Processing directory: $full_dir\n";

    # A lexical handle keeps each level of the recursion independent.
    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR " $!\n";
        return;
    }

    my @dir_content = grep { $_ !~ m/^\./ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
            next;
        }

        # file
        next if ($full_df !~ m/\.json$/);
        next if ($full_df =~ m/_structure\.json$/); # generated cache file

        read_json_file($full_df);
    }

    return;
}
469
470
# Script entry point.
#
# Parameters:
#   $argv_ref - reference to the command-line argument list.  Its first
#               element is removed and used as the directory to process;
#               "output" is used when that element is missing or false.
#
# One trailing "/" is stripped from the directory name before the walk
# starts.
sub main
{
    my ($argv_ref) = @_;

    my $first_arg = shift(@$argv_ref);
    my $toplevel_dir = $first_arg ? $first_arg : "output";

    # remove any trailing /
    $toplevel_dir =~ s{/$}{};

    process_dir($toplevel_dir);
}

main(\@ARGV);
Note: See TracBrowser for help on using the repository browser.