Ignore:
Timestamp:
2011-06-14T20:39:16+12:00 (13 years ago)
Author:
ak19
Message:

Implemented use_sections for PDFBox; this required grabbing the (at this moment) latest version of the PDFBox pre-built binary jar file: pdfbox-app-1.5.0.jar

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r23754 r24159  
    261261    $self->read_file ($conv_filename, "utf8", "", \$text);
    262262
     263    # To support the use_sections option with PDFBox: Greenstone splits PDFs into pages for
     264    # sections. The PDFPlugin code wants each new page to be prefixed with <a name=pagenum></a>,
     265    # which it then splits on to generate page-based sections. However, that's not what PDFBox
     266    # generates in its HTML output. Fortunately, PDFBox does have its own page-separator: it
     267    # embeds each page in an extra div. The div opener is:
     268    # <div style=\"page-break-before:always; page-break-after:always\">
     269    # The PDFPlugin now looks for this and prefixes <a name=0></a> to each such div. (The
     270    # pagenumber is fixed at 0 since I'm unable to work out how to increment the pagenum during
     271    # a regex substitution even with regex extensions on.) Later, when we process each section
     272    # to get the pagenum, PDFBox's output for this is pre-processed by having a loopcounter
     273    # that increments the pagenum for each subsequent section.
     274
     275    #$pdfbox_pageheader="\<div style=\"page-break-before:always; page-break-after:always\">";
     276    my $loopcounter = 0; # used later on!
     277    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;
     278
     279
    263280    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    264281    # for each page).  Metadata based on this calculation not set until process()
     
    267284    # be useful to give an indication of document length in browser through setting
    268285    # num_pages as metadata.
    269     my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
     286    my @pages = ($text =~ m/(\<[Aa] name=\"?\w+\"?>|\<div style=\"page-break-before:always; page-break-after:always\">)/ig); #<div style=\"?page-break-before:always; page-break-after:always\"?>
    270287    my $num_pages = scalar(@pages);
    271288    $self->{'num_pages'} = $num_pages;
     
    317334
    318335        $title = $1; # Greenstone does magic if sections are titled digits
     336
     337        # A title of pagenum=0 means use_sections is being applied on output from PDFBox,
     338        # which didn't originally have a <a name=incremented pagenumber></a> to split each page.
     339        # Our Perl code then prefixed <a name=0></a> to it. Now need to increment the pagenum here:
     340        if($loopcounter > 0 || ($title eq 0 && $loopcounter == 0)) { # implies use_sections with PDFBox
     341        $title = ++$loopcounter;
     342        }
     343
    319344        if (! defined($title) ) {
    320345        print STDERR "no title: $section\n";
     
    323348        my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
    324349        $newsection .= "<Metadata name=\"Title\">" . $title
    325         . "</Metadata>\n--><p>\n";
     350        . "</Metadata>\n--><br />\n";
    326351        $newsection .= $section;
    327352        $newsection .= "<!--</Section>-->\n";
Note: See TracChangeset for help on using the changeset viewer.