Changeset 24159
- Timestamp:
- 2011-06-14T20:39:16+12:00 (12 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r23754 r24159 261 261 $self->read_file ($conv_filename, "utf8", "", \$text); 262 262 263 # To support the use_sections option with PDFBox: Greenstone splits PDFs into pages for 264 # sections. The PDFPlugin code wants each new page to be prefixed with <a name=pagenum></a>, 265 # which it then splits on to generate page-based sections. However, that's not what PDFBox 266 # generates in its HTML output. Fortunately, PDFBox does have its own page-separator: it 267 # embeds each page in an extra div. The div opener is: 268 # <div style=\"page-break-before:always; page-break-after:always\"> 269 # The PDFPlugin now looks for this and prefixes <a name=0></a> to each such div. (The 270 # pagenumber is fixed at 0 since I'm unable to work out how to increment the pagenum during 271 # a regex substitution even with regex extensions on.) Later, when we process each section 272 # to get the pagenum, PDFBox's output for this is pre-processed by having a loopcounter 273 # that increments the pagenum for each subsequent section. 274 275 #$pdfbox_pageheader="\<div style=\"page-break-before:always; page-break-after:always\">"; 276 my $loopcounter = 0; # used later on! 277 $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g; 278 279 263 280 # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc 264 281 # for each page). Metadata based on this calculation not set until process() … … 267 284 # be useful to give an indication of document length in browser through setting 268 285 # num_pages as metadata. 
269 my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig); 286 my @pages = ($text =~ m/(\<[Aa] name=\"?\w+\"?>|\<div style=\"page-break-before:always; page-break-after:always\">)/ig); #<div style=\"?page-break-before:always; page-break-after:always\"?> 270 287 my $num_pages = scalar(@pages); 271 288 $self->{'num_pages'} = $num_pages; … … 317 334 318 335 $title = $1; # Greenstone does magic if sections are titled digits 336 337 # A title of pagenum=0 means use_sections is being applied on output from PDFBox, 338 # which didn't originally have a <a name=incremented pagenumber></a> to split each page. 339 # Our Perl code then prefixed <a name=0></a> to it. Now need to increment the pagenum here: 340 if($loopcounter > 0 || ($title eq 0 && $loopcounter == 0)) { # implies use_sections with PDFBox 341 $title = ++$loopcounter; 342 } 343 319 344 if (! defined($title) ) { 320 345 print STDERR "no title: $section\n"; … … 323 348 my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n"; 324 349 $newsection .= "<Metadata name=\"Title\">" . $title 325 . "</Metadata>\n-->< p>\n"; 350 . "</Metadata>\n--><br />\n"; 326 351 $newsection .= $section; 327 352 $newsection .= "<!--</Section>-->\n";
Note:
See TracChangeset
for help on using the changeset viewer.