Changeset 26442
- Timestamp: 2012-11-06T02:50:22+13:00 (11 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/hathitrust-downloadfrom/trunk/htc-get-pd-docs.pl
r26436 r26442 41 41 print STDERR "**** Status: ", $response->status_line, "\n"; 42 42 print "------\n"; 43 print STDERR "**** Content: ", $response->content, "\n"; 43 my $text_only_content = $response->content(); 44 $text_only_content =~ s/<[^>]*>//g; 45 $text_only_content =~ s/^\s*$//mg; 46 47 print STDERR "**** Content: $text_only_content\n"; 44 48 print "------\n"; 45 49 … … 58 62 print STDERR "Downloading PageImage $htid/$seq_num\n"; 59 63 64 my $retryCount = 0; 65 PageImageRetry: 60 66 my $response = _data_api("pageimage",$htid, $seq_num ); 61 my $content = $response->content(); 62 63 if (open(IMGOUT,">$ofilename")) { 64 binmode(IMGOUT); 65 print IMGOUT $content; 66 close(IMGOUT); 67 } 68 else { 69 print STDERR "Error: Failed to open $ofilename for binary output\n"; 70 print STDERR " $!\n"; 71 } 67 if (defined $response) { 68 $retryCount = 0; # reset it 69 my $content = $response->content(); 70 71 if (open(IMGOUT,">$ofilename")) { 72 binmode(IMGOUT); 73 print IMGOUT $content; 74 close(IMGOUT); 75 } 76 else { 77 print STDERR "Error: Failed to open $ofilename for binary output\n"; 78 print STDERR " $!\n"; 79 } 80 } 81 else { 82 $retryCount++; 83 print STDERR "Failed to download PageImage\n"; 84 85 if ($retryCount<2) { 86 print STDERR "Sleeping to 60 seconds\n"; 87 sleep(60); 88 print STDERR "Retry attempt $retryCount\n"; 89 goto PageImageRetry; 90 } 91 else { 92 print STDERR "Maximum number of attempts reached. 
Stopping.\n"; 93 exit -1; 94 } 95 } 96 72 97 } 73 98 else { … … 88 113 || (!defined $ofilename)) { 89 114 print STDERR "Downloading PageOCR (text) $htid/$seq_num\n"; 115 116 my $retryCount = 0; 117 PageOcrRetry: 118 119 my $response = _data_api("pageocr",$htid, $seq_num ); 120 121 if (defined $response) { 122 $retryCount = 0; # reset it 123 124 $content = $response->content(); 90 125 91 my $response = _data_api("pageocr",$htid, $seq_num ); 92 $content = $response->content(); 93 94 if (open(TXTOUT,">$ofilename")) { 95 print TXTOUT $content; 96 close(TXTOUT); 97 } 98 else { 99 print STDERR "Error: Failed to open $ofilename for binary output\n"; 100 print STDERR " $!\n"; 101 } 126 if (open(TXTOUT,">$ofilename")) { 127 print TXTOUT $content; 128 close(TXTOUT); 129 } 130 else { 131 print STDERR "Error: Failed to open $ofilename for binary output\n"; 132 print STDERR " $!\n"; 133 } 134 } 135 else { 136 $retryCount++; 137 print STDERR "Failed to download PageOCR\n"; 138 139 if ($retryCount<2) { 140 print STDERR "Sleeping to 60 seconds\n"; 141 sleep(60); 142 print STDERR "Retry attempt $retryCount\n"; 143 goto PageOcrRetry; 144 } 145 else { 146 print STDERR "Maximum number of attempts reached. 
Stopping.\n"; 147 exit -1; 148 } 149 } 150 102 151 } 103 152 else { … … 198 247 sub rec_paged_image_structure 199 248 { 200 my ($this_div,$pagenum,$ elem_name,$depth,$htid,$file_id_map,$resource_output_dir) = @_;249 my ($this_div,$pagenum,$depth,$htid,$file_id_map,$resource_output_dir) = @_; 201 250 202 251 my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/); 203 252 204 print PIOUT " " x $depth, "<$elem_name>\n";205 253 206 254 my $fptr_entry = $this_div->{'METS:fptr'}; 255 256 if (defined $this_div->{'METS:div'}) { 257 # Only want Greenstones <PageGroup> tag if not a METS leaf div 258 print PIOUT " " x $depth, "<PageGroup>\n"; 259 } 207 260 208 261 if (defined $fptr_entry) { … … 220 273 my $imgfile = undef; 221 274 my $txtfile = undef; 275 222 276 223 277 foreach my $fptr_hash (@$fptr_array) { … … 229 283 my $seq = $file->{'SEQ'}; 230 284 my $href = $file->{'METS:FLocat'}->{'xlink:href'}; 285 231 286 232 287 if ($file->{'USE'} =~ m/\bimage\b/i) { … … 240 295 pageocr_data_api($htid,$seq,$full_txtfile); 241 296 } 297 242 298 } 243 299 # Generate line along the following lines … … 268 324 } 269 325 326 print STDERR "+ Processing ", scalar(@$div_array), " sections\n"; 327 270 328 foreach my $div_hash (@$div_array) { 271 329 272 330 my $pagenum = $div_hash->{'ORDER'}; 273 331 274 rec_paged_image_structure($div_hash,$pagenum,"PageGroup",$depth+1,$htid,$file_id_map,$resource_output_dir); 275 } 276 } 277 278 279 print PIOUT " " x $depth, "</$elem_name>\n"; 332 rec_paged_image_structure($div_hash,$pagenum,$depth+1,$htid,$file_id_map,$resource_output_dir); 333 } 334 } 335 336 if (defined $this_div->{'METS:div'}) { 337 # Only want Greenstones <PageGroup> tag if not a METS leaf div 338 print PIOUT " " x $depth, "</PageGroup>\n"; 339 } 340 280 341 281 342 } … … 294 355 if (open(PIOUT,">$ofilename")) { 295 356 binmode(PIOUT,":utf8"); 296 297 rec_paged_image_structure($toplevel_div,1,"PageDocument",0,$htid,$file_id_map,$resource_output_dir); 357 358 print PIOUT 
"<PagedDocument>\n"; 359 # print PIOUT " <PageGroup>\n"; 360 361 rec_paged_image_structure($toplevel_div,1,1,$htid,$file_id_map,$resource_output_dir); 362 363 # print PIOUT " </PageGroup>\n"; 364 print PIOUT "</PagedDocument>\n"; 298 365 299 366 close(PIOUT); … … 308 375 } 309 376 377 378 my $pdCount = 0; 310 379 311 380 sub download_ht_doc … … 367 436 ## print "**** json_content = $json_content_utf8\n\n"; 368 437 369 exit 0; 438 $pdCount++; 439 440 # if ($pdCount>5) { 441 # exit 0; 442 # } 370 443 371 444 } … … 395 468 my $primary_cat_key = shift @record_keys; 396 469 397 my $items_array = $json_data->{'items'}; 470 my $items_entry = $json_data->{'items'}; 471 my $items_array; 472 473 print STDERR "*** ref: ", ref $items_entry, "\n\n"; 474 475 476 if (ref $items_entry eq "HASH") { 477 $items_array = [ $items_entry ]; 478 } 479 else { 480 $items_array = $items_entry; 481 } 482 398 483 my $num_items = scalar(@$items_array); 399 484 … … 441 526 442 527 if (opendir(DIN, $full_dir)) { 443 my @dir_content = grep { $_ !~ m/^\./ } readdir(DIN);528 my @dir_content = grep { $_ !~ m/^\./ } sort readdir(DIN); 444 529 closedir DIN; 445 530
Note: See TracChangeset for help on using the changeset viewer.