Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl@ 26436

Last change on this file since 26436 was 26436, checked in by davidb, 11 years ago
Initial cut at code for exporting content out of the Hathitrust, suitable for ingest by Greenstone
File size: 11.5 KB

Line
1	#!/usr/bin/perl -w
2
3	use strict;
4	no strict 'refs'; # allow filehandles to be variables and viceversa
5
6	use warnings;
7
8	use Encode;
9	use JSON;
10
11	# use LWP;
12
13	use OAuth::Lite::Consumer;
14	use OAuth::Lite::AuthMethod;
15
16	use URI::Escape;
17
# _data_api($mode, $htid, $opt_seq, $opt_params)
#
# Issue an OAuth-signed GET request against the HathiTrust Data API and
# return the response object.
#
#   $mode       - API resource name placed in the request URL (callers in
#                 this file pass "pageimage", "pageocr" and "structure")
#   $htid       - volume identifier, appended to the URL as given
#   $opt_seq    - optional page sequence number; appended as a further path
#                 segment only when defined
#   $opt_params - optional hash ref of extra request parameters
#
# Returns the response object on success.  On failure the request URI,
# status line and body are reported on STDERR and undef is returned, so
# callers must check the result before using it.
sub _data_api
{
    my ($mode, $htid, $opt_seq, $opt_params) = @_;

    # NOTE(review): credentials are embedded in the source.  They may be
    # overridden through the HT_ACCESS_KEY / HT_SECRET_KEY environment
    # variables; the built-in values remain the default so existing
    # invocations behave as before.
    my $access_key = '7e6ee38bae';
    my $secret_key = 'e0429c0394385486249b4a230702';

    if (defined $ENV{'HT_ACCESS_KEY'} && length $ENV{'HT_ACCESS_KEY'}) {
        $access_key = $ENV{'HT_ACCESS_KEY'};
    }
    if (defined $ENV{'HT_SECRET_KEY'} && length $ENV{'HT_SECRET_KEY'}) {
        $secret_key = $ENV{'HT_SECRET_KEY'};
    }

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";

    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new( 'consumer_key'    => $access_key,
                                               'consumer_secret' => $secret_key,
                                               'auth_method'     => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request( 'method' => 'GET',
                                       'url'    => $request_url,
                                       'params' => $opt_params );

    if (!$response->is_success()) {
        # Keep the whole failure report on STDERR so it stays in one piece
        # and does not leak separator lines into STDOUT.
        print STDERR "**** Failed to retrieve any content from URL:\n";
        print STDERR " ", $consumer->oauth_request->uri, "\n";
        print STDERR "------\n";
        print STDERR "**** Status: ", $response->status_line, "\n";
        print STDERR "------\n";
        print STDERR "**** Content: ", $response->content, "\n";
        print STDERR "------\n";

        $response = undef;
    }

    return $response;
}
51
52
# pageimage_data_api($htid, $seq_num, $ofilename)
#
# Fetch the page image for sequence number $seq_num of volume $htid and
# store it in $ofilename.  An existing $ofilename is treated as a cache
# hit: nothing is requested and the file is left untouched.
#
# Problems (failed request, unwritable file) are reported on STDERR; the
# sub never dies because of them.  No meaningful value is returned.
sub pageimage_data_api
{
    my ($htid, $seq_num, $ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    # _data_api() yields undef (after logging the details) when the request
    # fails, so it must be checked before content() is called on it.
    my $response = _data_api("pageimage", $htid, $seq_num);
    if (!defined $response) {
        print STDERR "Error: No PageImage data received for $htid/$seq_num\n";
        print STDERR "=> $ofilename not written\n";
        return;
    }

    my $img_out;
    if (!open($img_out, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR " $!\n";
        return;
    }

    binmode($img_out);
    my $write_ok = print $img_out $response->content();
    my $close_ok = close($img_out);

    if (!$write_ok || !$close_ok) {
        print STDERR "Error: Failed to write image data to $ofilename\n";
        print STDERR " $!\n";
        # A truncated file would be mistaken for a cached download next time.
        unlink($ofilename);
    }

    return;
}
78
79
80
# pageocr_data_api($htid, $seq_num, $ofilename)
#
# Return the OCR text for sequence number $seq_num of volume $htid.
#
# $ofilename is optional.  When given and the file already exists, the text
# is read back from it instead of being requested.  When given and the file
# does not exist, the downloaded text is also saved there.  When undefined,
# the text is downloaded and returned without touching the disk.
#
# Returns the text, or undef when it could not be obtained (details are
# reported on STDERR).
#
# NOTE(review): a freshly downloaded page is returned exactly as the
# response's content() supplies it, whereas a cached page is read through
# a :utf8 layer.  Current callers in this file ignore the return value.
sub pageocr_data_api
{
    my ($htid, $seq_num, $ofilename) = @_;

    my $content = undef;

    my $have_cache = (defined $ofilename) && (-f $ofilename);

    if (!$have_cache) {
        print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

        # _data_api() yields undef (after logging) when the request fails.
        my $response = _data_api("pageocr", $htid, $seq_num);
        if (!defined $response) {
            print STDERR "Error: No PageOCR data received for $htid/$seq_num\n";
            return $content;
        }

        $content = $response->content();

        # Saving is only possible when the caller named an output file.
        if (defined $ofilename) {
            my $txt_out;
            if (open($txt_out, '>', $ofilename)) {
                print $txt_out $content;
                close($txt_out);
            }
            else {
                print STDERR "Error: Failed to open $ofilename for output\n";
                print STDERR " $!\n";
            }
        }
    }
    else {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n $ofilename\n";

        my $txt_in;
        if (open($txt_in, '<', $ofilename)) {
            binmode($txt_in, ":utf8");

            # $content stays undef for an empty file, as lines are only
            # appended when there is something to read.
            my $line;
            while (defined ($line = <$txt_in>)) {
                $content .= $line;
            }
            close($txt_in);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR " $!\n";
        }
    }

    return $content;
}
124
# json_structure_data_api($htid, $ofilename)
#
# Obtain the structure record of volume $htid as a decoded JSON data
# structure.  When $ofilename already exists it is used as a cache;
# otherwise the record is requested (with alt=json) and saved to
# $ofilename for next time.
#
# Returns whatever decode_json produces for the record.
# Dies when the record cannot be downloaded or when the JSON text cannot
# be parsed (including the case where a cached file cannot be read).
sub json_structure_data_api
{
    my ($htid, $ofilename) = @_;

    # The JSON text is handled as raw octets from end to end: decode_json
    # expects UTF-8 encoded input, so pushing the text through a :utf8
    # layer or re-encoding it would mangle any non-ASCII characters.
    # Assumes the response's content() supplies undecoded octets, as
    # HTTP::Response documents - TODO confirm for the OAuth::Lite wrapper.
    my $json_octets = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure", $htid, undef, {'alt' => "json"});
        if (!defined $response) {
            # _data_api() has already reported the request details.
            die "Error: Failed to download structure record for $htid\n";
        }
        $json_octets = $response->content();

        my $js_out;
        if (open($js_out, '>', $ofilename)) {
            binmode($js_out);
            print $js_out $json_octets;
            close($js_out);
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR " $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n $ofilename\n";

        my $js_in;
        if (open($js_in, '<', $ofilename)) {
            binmode($js_in);

            local $/ = undef;           # slurp the whole file in one read
            my $slurped = <$js_in>;
            $json_octets = $slurped if defined $slurped;
            close($js_in);
        }
        else {
            print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
            print STDERR " $!\n";
        }
    }

    my $json_data = decode_json $json_octets;

    return $json_data;
}
176
177
178	# Example file
179
180	#<PagedDocument>
181	# <Metadata name="Title">Matariki 1881</Metadata>
182	# <Metadata name="Date">18810423</Metadata>
183	# <Metadata name="Number">1</Metadata>
184	# <PageGroup>
185	# <Metadata name="Title">Supplementary Material</Metadata>
186	# <Page txtfile="abstracts/23__1abstract.txt">
187	# <Metadata name="Title">Abstract</Metadata>
188	# </Page>
189	# </PageGroup>
190	# <PageGroup>
191	# <Metadata name="Title">Newspaper pages</Metadata>
192	# <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
193	# <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
194	# <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
195	# </PageGroup>
196	#</PagedDocument>
197
# rec_paged_image_structure($this_div, $pagenum, $elem_name, $depth,
#                           $htid, $file_id_map, $resource_output_dir)
#
# Recursively write one structure div, and everything nested inside it, to
# the already-open PIOUT filehandle as item-file XML.
#
#   $this_div            - hash ref for the div being written
#   $pagenum             - value for the pagenum attribute of a leaf page
#   $elem_name           - element name wrapping this div
#   $depth               - nesting depth; one space of indent per level
#   $htid                - volume identifier passed to the download helpers
#   $file_id_map         - hash ref mapping FILEID to its file entry (each
#                          entry is read for 'USE', 'SEQ' and
#                          'METS:FLocat' -> 'xlink:href')
#   $resource_output_dir - directory receiving the downloaded page files
#
# A div carrying 'METS:fptr' is a leaf: its image and OCR files are
# downloaded and a <Page/> element is written.  Child 'METS:div' entries
# are written as nested <PageGroup> elements.
sub rec_paged_image_structure
{
    my ($this_div, $pagenum, $elem_name, $depth, $htid, $file_id_map, $resource_output_dir) = @_;

    # Paths recorded in the item file use only the final component of the
    # resource directory (generate_paged_image_structure derives that
    # directory from the item file's own name, so it sits next to it).
    my $local_output_dir = $resource_output_dir;
    if (defined $resource_output_dir && $resource_output_dir =~ m{([^/]+)/*\z}) {
        $local_output_dir = $1;
    }

    print PIOUT " " x $depth, "<$elem_name>\n";

    my $fptr_entry = $this_div->{'METS:fptr'};

    if (defined $fptr_entry) {
        # hit a leaf node

        # A single file pointer arrives as a hash; treat it as a list of one.
        my $fptr_array = (ref $fptr_entry eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};

            my $file = (defined $fileid) ? $file_id_map->{$fileid} : undef;
            if (!defined $file) {
                my $shown_id = (defined $fileid) ? $fileid : "(none)";
                print STDERR "Warning: No file entry found for FILEID $shown_id\n";
                next;
            }

            my $seq  = $file->{'SEQ'};
            my $href = $file->{'METS:FLocat'}->{'xlink:href'};
            my $use  = $file->{'USE'};

            # Without a usage label or a target name there is nothing to fetch.
            next if (!defined $use || !defined $href);

            if ($use =~ m/\bimage\b/i) {
                $imgfile = "$local_output_dir/$href";
                my $full_imgfile = "$resource_output_dir/$href";
                pageimage_data_api($htid, $seq, $full_imgfile);
            }
            elsif ($use =~ m/\bocr\b/i) {
                $txtfile = "$local_output_dir/$href";
                my $full_txtfile = "$resource_output_dir/$href";
                pageocr_data_api($htid, $seq, $full_txtfile);
            }
        }

        # Generate a line along the following lines:
        # <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT " " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"$pagenum\" " if defined $pagenum;
        print PIOUT "imgfile=\"$imgfile\" " if defined $imgfile;
        print PIOUT "txtfile=\"$txtfile\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs

    my $div_entry = $this_div->{'METS:div'};

    if (defined $div_entry) {
        # upgrade single entry to array
        my $div_array = (ref $div_entry eq "HASH") ? [ $div_entry ] : $div_entry;

        foreach my $div_hash (@$div_array) {
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash, $child_pagenum, "PageGroup", $depth+1,
                                      $htid, $file_id_map, $resource_output_dir);
        }
    }

    print PIOUT " " x $depth, "</$elem_name>\n";
}
282
# generate_paged_image_structure($toplevel_div, $htid, $file_id_map, $ofilename)
#
# Write the item XML file $ofilename describing $toplevel_div, downloading
# the referenced page files along the way.
#
# The page files go into a directory named after $ofilename with its file
# extension removed; it is created when missing.  The XML itself is
# produced by rec_paged_image_structure(), which prints to the package
# filehandle PIOUT - hence the bareword handle here.
#
# Failures are reported on STDERR; no meaningful value is returned.
sub generate_paged_image_structure
{
    my ($toplevel_div, $htid, $file_id_map, $ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);
    if (!defined $resource_output_dir) {
        print STDERR "Error: Cannot derive a resource directory from $ofilename\n";
        return;
    }

    if (!-d $resource_output_dir && !mkdir($resource_output_dir)) {
        print STDERR "Error: Failed to create directory $resource_output_dir\n";
        print STDERR " $!\n";
        return;
    }

    if (open(PIOUT, '>', $ofilename)) {
        binmode(PIOUT, ":utf8");

        # Root element is <PagedDocument>, matching the example item file
        # shown in the comment block earlier in this script.
        rec_paged_image_structure($toplevel_div, 1, "PagedDocument", 0,
                                  $htid, $file_id_map, $resource_output_dir);

        close(PIOUT);
    }
    else {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR " $!\n";
    }

    return;
}
309
310
# download_ht_doc($cat_key, $htid, $ofilename)
#
# Export one volume: obtain its structure record (cached in $ofilename),
# index the file entries by ID, then write the item XML file and download
# the page files through generate_paged_image_structure().
#
#   $cat_key   - catalogue key of the owning record (accepted for the
#                caller's benefit; not used here)
#   $htid      - volume identifier
#   $ofilename - path of the structure JSON cache file; the item file name
#                is derived from it by replacing a trailing
#                "_structure.json" with "_item.xml"
#
# Problems are reported on STDERR and the volume is abandoned.  The sub
# returns to its caller either way so that a directory crawl can carry on
# with the next record.
sub download_ht_doc
{
    my ($cat_key, $htid, $ofilename) = @_;

    # The structure helper dies on download or parse failures; contain that
    # so one bad volume does not end the whole run.
    my $json_data = eval { json_structure_data_api($htid, $ofilename) };
    if (ref $json_data ne "HASH" || ref $json_data->{'METS:mets'} ne "HASH") {
        print STDERR "Error: No usable structure record for $htid\n";
        print STDERR " $@" if $@;
        return;
    }
    my $mets = $json_data->{'METS:mets'};

    # An element that occurs once arrives as a hash, several as an array;
    # this flattens either form (or a missing entry) into a plain list.
    my $as_list = sub {
        my ($entry) = @_;
        return () if !defined $entry;
        return (ref $entry eq "ARRAY") ? @$entry : ($entry);
    };

    # Map in the IDs from:
    #   METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_sec = $mets->{'METS:fileSec'};
    my @file_grps = (ref $file_sec eq "HASH") ? $as_list->($file_sec->{'METS:fileGrp'}) : ();

    foreach my $file_grp (@file_grps) {
        next if ref $file_grp ne "HASH";

        my $use = $file_grp->{'USE'};

        foreach my $file_hash ($as_list->($file_grp->{'METS:file'})) {
            next if ref $file_hash ne "HASH";

            # push file_grp USE attribute down into each file entry (to make
            # lookups easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            $file_sec_ids->{$file_id} = $file_hash if defined $file_id;
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my ($struct_map) = $as_list->($mets->{'METS:structMap'});
    my $toplevel_div = (ref $struct_map eq "HASH") ? $struct_map->{'METS:div'} : undef;
    if (!defined $toplevel_div) {
        print STDERR "Error: Structure record for $htid has no top-level METS:div\n";
        return;
    }

    my $pi_filename = $ofilename;
    if ($pi_filename !~ s/_structure\.json$/_item.xml/) {
        # Never let the item file overwrite the structure cache itself.
        $pi_filename .= "_item.xml";
    }

    generate_paged_image_structure($toplevel_div, $htid, $file_sec_ids, $pi_filename);

    return;
}
372
# read_json_file($filename)
#
# Read one catalogue JSON file and export the first item in it whose
# 'rightsCode' is "pd" by handing it to download_ht_doc().
#
# The file is expected to decode to a hash with a 'records' hash (one of
# its keys is passed on as the catalogue key) and an 'items' array of
# hashes carrying 'htid' and 'rightsCode'.  Files that cannot be read or
# parsed, or that lack an 'items' array, are reported on STDERR and
# skipped.
#
# The structure cache file passed to download_ht_doc() is $filename with
# its trailing ".json" replaced by "_structure.json".
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Processing file: $filename\n";

    my $json_fh;
    if (!open($json_fh, '<', $filename)) {
        print STDERR "Error: Failed to open JSON file $filename for input\n";
        print STDERR " $!\n";
        return;
    }
    binmode($json_fh);
    # decode_json wants the UTF-8 octets exactly as stored on disk.
    my $json_octets = do { local $/ = undef; <$json_fh> };
    close($json_fh);
    $json_octets = "" if !defined $json_octets;

    my $json_data = eval { decode_json $json_octets };
    if (ref $json_data ne "HASH") {
        print STDERR "Error: Failed to parse $filename as a JSON object\n";
        print STDERR " $@" if $@;
        return;
    }

    my $record_hash = $json_data->{'records'};
    my ($primary_cat_key) = (ref $record_hash eq "HASH") ? keys %$record_hash : ();

    my $items_array = $json_data->{'items'};
    if (ref $items_array ne "ARRAY") {
        print STDERR "Skipping $filename: no 'items' array found\n";
        return;
    }

    foreach my $item (@$items_array) {
        next if ref $item ne "HASH";

        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        if (defined($rights_code) && ($rights_code eq "pd")) {
            # in the public domain

            # Anchored so that only the file extension is rewritten, never a
            # ".json" occurring earlier in the path.
            my $ofilename = $filename;
            $ofilename =~ s/\.json$/_structure.json/;

            download_ht_doc($primary_cat_key, $htid, $ofilename);

            # bail out at first public domain version of document
            last;
        }
    }

    return;
}
434
435
# process_dir($full_dir)
#
# Walk $full_dir recursively and pass every catalogue JSON file found to
# read_json_file().
#
# Entries whose names start with "." are ignored.  Files ending in
# "_structure.json" are ignored too: those are the structure caches that
# read_json_file() arranges to be written next to each catalogue file, not
# catalogue records.  Entries are visited in sorted name order.
#
# A directory that cannot be opened is reported on STDERR and skipped.
sub process_dir
{
    my ($full_dir) = @_;

    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR " $!\n";
        return;
    }

    # Read the listing up front so the handle is closed before recursing.
    my @dir_content = sort grep { $_ !~ m/^\./ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
        }
        elsif ($full_df =~ m/\.json$/ && $full_df !~ m/_structure\.json$/) {
            read_json_file($full_df);
        }
    }

    return;
}
469
470
# main($argv_ref)
#
# Script driver.  Takes the first element off the argument list referenced
# by $argv_ref as the directory to crawl (falling back to "output" when it
# is absent or false), drops one trailing slash, and walks the directory
# with process_dir().
sub main
{
    my ($argv_ref) = @_;

    my $toplevel_dir = shift @$argv_ref;
    if (!$toplevel_dir) {
        $toplevel_dir = "output";
    }

    # remove any trailing /
    $toplevel_dir =~ s/\/$//;

    process_dir($toplevel_dir);
}
483
# Script entry point: hand the command-line arguments to main() by
# reference (main() takes the first one as the directory to process).
main(\@ARGV);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: