Changeset 32206 for main/trunk/greenstone2/perllib/plugins
- Timestamp:
- 2018-06-22T22:04:16+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2/perllib/plugins
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r32205 r32206 349 349 $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type"; 350 350 } 351 } elsif ($output_type eq "paged_html") {352 $output_filename =~ s/$lc_suffix$/.html/;353 351 } else { 354 352 $output_filename =~ s/$lc_suffix$/.$output_type/; … … 374 372 if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline 375 373 376 # We used to return -1 here if $conv_filename didn't exist at this stage377 # However, for "paged_html" convert_to mode, the converted HTML file $conv_filename378 # will only be created from conversion products *after* convert_post_process() returns379 374 my $output_type=$self->{'convert_to'}; 380 if ( $output_type ne "paged_html" && ! -e "$conv_filename") {return -1;}375 if (!&FileUtils::fileExists($conv_filename)) {return -1;} 381 376 $self->{'conv_filename'} = $conv_filename; 382 377 $self->convert_post_process($conv_filename); 383 if ($output_type eq "paged_html" && ! -e "$conv_filename") {return -1;} 378 379 # Check if, after post-processing, the final expected output file has changed 380 # And if it has, check that the final output file now exists after post processing 381 if(defined $self->{'conv_filename_after_post_process'}) { 382 $conv_filename = $self->{'conv_filename_after_post_process'}; 383 if (!&FileUtils::fileExists($conv_filename)) {return -1;} 384 } 384 385 385 386 # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r32205 r32206 281 281 } 282 282 283 # Overriding to do some extra handling for paged_html output mode 284 sub run_conversion_command { 285 my $self = shift (@_); 286 my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_; 287 288 if($self->{'convert_to'} ne "paged_html") { 289 return $self->ConvertBinaryFile::run_conversion_command(@_); 290 } 291 292 # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it 293 # to create a subdir called "pages" in the tmp area to puts its products 294 # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory 295 # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run, 296 # the intermediary output file tmp/<random-num>/pages/index.html should 297 # exist (besides other output products there) 298 299 # We let ConvertBinaryFile proceed normally, but the return value should reflect 300 # that on success it should expect the intermediary product tmpdir/pages/index.html 301 # (which is the product of xpdftohtml conversion). 302 my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_); 303 $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html"); 304 305 # However, when convert_post_process() is done, it should have output the final 306 # product of the paged_html conversion: an html file of the same name and in the 307 # same tmp location as the input PDF file. 308 309 my ($name_prefix, $output_dir, $ext) 310 = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$"); 311 $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html"); 312 # print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . 
"\n"; 313 314 return $output_filename; 315 } 316 283 317 sub convert_post_process 284 318 { … … 287 321 288 322 my $outhandle=$self->{'outhandle'}; 289 # print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n"; 290 291 if($self->{'convert_to'} eq "paged_html") { 323 324 if($self->{'convert_to'} eq "paged_html") { 292 325 # special post-processing for paged_html mode, as HTML pages generated 293 326 # by xpdf's pdftohtml need to be massaged into the form we want … … 313 346 { 314 347 my $self = shift (@_); 315 my ($output_filename) = @_; # output_filename = tmp location + filename 316 # if a single html were generated. 317 # We just want the tmp location, append "pages", and read all the html files 318 # in except for index.html. Then we create a new html file by name 319 # $output_filename, which will consist of a slightly modified version of 348 my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode 349 my $output_filename = $self->{'conv_filename_after_post_process'}; 350 351 # Read in all the html files in tmp's "pages" subdir, except for index.html. 352 # and use it to create a new html file called $self->{'conv_filename_after_post_process'} 353 # which will consist of a slightly modified version of 320 354 # each of the other html files concatenated together. 321 355 322 356 my $outhandle=$self->{'outhandle'}; 323 357 324 my ($tailname, $tmp_dir, $suffix) 325 = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$"); 326 my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages"); 358 my ($tailname, $pages_subdir, $suffix) 359 = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$"); 327 360 328 361 # Code from util::create_itemfile() … … 356 389 # Xpdf's pdftohtml for each of its individual html pages. 
357 390 my $start_text = "<html>\n<head>\n"; 358 $start_text .= "<title>$tailname</title>\n"; 391 my ($output_tailname, $tmp_subdir, $html_suffix) 392 = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$"); 393 $start_text .= "<title>$output_tailname</title>\n"; 359 394 $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"; 360 395 $start_text .= "</head>\n<body>\n\n"; … … 393 428 # subdir, since we've now processed them all into a single html file 394 429 # one folder level up and we don't want HTMLPlugin to process all of them next. 395 # my @fullpath_page_files = map { &FileUtils::filenameConcatenate($pages_subdir, $_) } @page_files;396 430 &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); # no specific whitelist, but blacklist htm(l) 397 431 … … 411 445 # can't seem to determine type of string with ref/reftype 412 446 # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable 413 414 # $dom objects appear to get correctly stringified in string contexts 447 # Not needed, as $dom objects seem to get correctly stringified in string contexts 415 448 # $dom.to_string/$dom.stringify seem to get called, no need to call them 416 449 # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl … … 444 477 # store the first style element, which is the only one and in the <body> 445 478 # we'll later insert it as child of an all-encompassing div that we'll create 446 # my $page_style_tag_str = $dom->find('style')->[0]->to_string;447 # my $page_style_tag_str = $dom->find('html style')->[0]->to_string;448 479 my $page_style_tag_str = $dom->at('html')->at('style')->to_string; 449 480 # In the style tag, convert id style references to class style references … … 457 488 my $img_height = $dom->find('img')->[0]{height}; 458 489 459 460 # 1. Fix up the style attr on the image by additionally setting z-index=-1 for it461 490 # 2. 
Adjust the img#background src attribute to point to the pages subdir for imgs 462 491 # 3. Set that img tag's class=background, and change its id to background+$page_num 463 492 my $bg_img_tag=$dom->find('img#background')->[0]; 464 465 my $img_style_str = $bg_img_tag->{style}; # = $dom->find('img#background')->[0]{style}466 $img_style_str = $img_style_str." z-index=-1;";467 #print STDERR "img_style_str: " . $img_style_str."\n";468 493 my $img_src_str = $bg_img_tag->{src}; 469 494 $img_src_str = "pages/$img_src_str"; 470 $bg_img_tag->attr( {style => $img_style_str, src => $img_src_str}); # reset495 $bg_img_tag->attr(src => $img_src_str); # reset 471 496 #$self->_debug_print_html($bg_img_tag); 472 497 # set both class and modified id attributes in one step: … … 487 512 # Finally can create our new dom, starting with a div tag for the current page 488 513 # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/> 489 my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" ); 514 # my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" ) 515 my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" ); 490 516 #$self->_debug_print_html($new_dom); 491 517 $new_dom->at('div')->append_content($style_element)->root; 492 518 493 # Append a page range bucket heading if applicable 519 520 #$self->_debug_print_html($new_dom); 521 # Copy across all the old html's body tag's child nodes into the new dom's new div tag 522 $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string 523 #$self->_debug_print_html($new_dom); 524 525 526 # build up the outer div with the <h>tags for sectionalising 527 my $inner_div_str = $new_dom->to_string; 528 529 my $page_div = "<div id=\"page".$page_num."\">\n"; 530 # Append a page range bucket heading if applicable: 
if we have more than 10 pages 531 # to display in the current bucket AND we're on the first page of each bucket of 10 pages. 494 532 # Dr Bainbridge thinks for now we need only consider PDFs where the 495 533 # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...) 496 534 # If number of remaining pages >= 10, then create new bucket heading 497 535 # e.g. "Pages 30-40" 498 if(($ num_html_pages - $page_num) > 10) {536 if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) { 499 537 # Double-digit page numbers that start with 2 500 538 # i.e. 21 to 29 (and 30) should be in 21 to 30 range 501 539 my $start_range = $page_num - ($page_num % 10) + 1; 502 540 my $end_range = $page_num + 10 - ($page_num % 10); 503 if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range 504 $start_range -= 10; 505 $end_range -= 10; 506 } 507 $new_dom->at('div')->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range))->root; 508 } 509 510 # Add a simpler heading: just the pagenumber, "Page #" 541 $page_div .= "<h1 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . 
$end_range."</h1>\n"; 542 } 543 544 # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #" 545 $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h2>\n"; 511 546 $new_dom->at('div')->append_content($new_dom->new_tag('h2', "Page ".$page_num))->root; 512 #$self->_debug_print_html($new_dom); 513 # Copy across all the old html's body tag's child nodes into the new dom's new div tag 514 $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string 515 #$self->_debug_print_html($new_dom); 547 548 $page_div .= $inner_div_str; 549 $page_div .= "\n</div>"; 516 550 517 551 # Finished processing a single html page of the paged_html output generated by 518 552 # Xpdf's pdftohtml: finished massaging that single html page into the right form 519 return $new_dom->to_string; 553 return $page_div; 520 554 } 521 555
Note:
See TracChangeset
for help on using the changeset viewer.