source: main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm@ 32283

Last change on this file since 32283 was 32283, checked in by ak19, 6 years ago

More stable behaviour by PDFv2Plugin: 1. When pdfbox_conversion is on, but an output option supported by xpdftools is selected, it now uses xpdftools anyway instead of attempting to use pdfbox_conversion. 2. When pdfbox_conversion is not on and an output format that it alone supports (and that is not supported by xpdftools) is selected, a warning message is displayed saying that xpdftools will be used to output to a fallback output format, and that the user should switch on pdfbox_conversion to get the requested format. This message was present and displayed in a recent commit, but the behaviour was not set up yet then. In future, we may solve this differently if it's decided that PDFBoxConverter is not an AutoLoadConverter and will therefore always be available with PDFv2Plugin (but what about GS2, where PDFBox is an optional extension?)

File size: 32.9 KB
Line 
1###########################################################################
2#
3# PDFv2Plugin.pm -- pdf plugin that uses xpdftools or, if switched on,
4# pdfbox, to process PDFs.
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFv2Plugin;
27
28use strict;
29no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
30no strict 'subs'; # allow filehandles to be variables and viceversa
31
32use ReadTextFile;
33use unicode;
34use Mojo::DOM; # for HTML parsing
35
36use AutoLoadConverters;
37use ConvertBinaryFile;
38
39@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
40
41
# Output formats offered by the -convert_to option. Each entry pairs the
# option value with the resource key of its description. The trailing
# comments record which conversion tool the original authors associated
# with each format.
# NOTE(review): paged_text is tagged "xpdftools" here, yet new() falls back
# to "text" for it when pdfbox_conversion is off -- confirm which is intended.
my $convert_to_list = [
    { name => 'auto',
      desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'text',                 # xpdftools
      desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'paged_text',           # xpdftools
      desc => '{ConvertBinaryFile.convert_to.paged_text}' },

    { name => 'html',                 # pdfbox ## TODO: rename this to html_without_imgs
      desc => '{PDFPlugin.convert_to.html}' },
    { name => 'pretty_html',          # xpdftools
      desc => '{PDFPlugin.convert_to.pretty_html}' },
    { name => 'paged_pretty_html',    # xpdftools
      desc => '{PDFPlugin.convert_to.paged_pretty_html}' },

    # pdfbox
    { name => 'pagedimg_jpg',
      desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_png',
      desc => '{ConvertBinaryFile.convert_to.pagedimg_png}' },

    { name => 'pagedimgtxt_jpg',
      desc => '{ConvertBinaryFile.convert_to.pagedimgtxt_jpg}' },
    { name => 'pagedimgtxt_png',
      desc => '{ConvertBinaryFile.convert_to.pagedimgtxt_png}' },
];
68
69
# Argument descriptors contributed by this plugin; new() appends them to the
# shared "ArgList".
#
# The following PDFPlugin options are deliberately left out of this list
# (they were commented out in the original table): noimages,
# allowimagesonly, complex, nohidden, use_sections, description_tags.
my $arguments = [
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'pretty_html',
    },
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{CommonUtil.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {
        name => 'metadata_fields',
        desc => '{HTMLPlugin.metadata_fields}',
        type => 'string',
        deft => 'Title,Author,Subject,Keywords',
    },
    {
        name => 'metadata_field_separator',
        desc => '{HTMLPlugin.metadata_field_separator}',
        type => 'string',
        deft => '',
    },
    {
        # Kept as a string because xpdftools' zoom takes fractions.
        name => 'zoom',
        desc => '{PDFv2Plugin.zoom}',
        deft => '1',
        type => 'string',
    },
    {
        name => 'use_realistic_book',
        desc => '{PDFPlugin.use_realistic_book}',
        type => 'flag',
    },
];
121
# Plugin descriptor that new() appends to the shared "OptList".
my $options = {
    name           => 'PDFv2Plugin',
    desc           => '{PDFPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    srcreplaceable => 'yes',    # Source docs in PDF can be replaced with GS-generated html
    args           => $arguments,
};
128
# Constructor.
#
# Registers this plugin's arguments and options, builds the object by merging
# an AutoLoadConverters object (with PDFBoxConverter) and a ConvertBinaryFile
# object, works out the effective 'convert_to' format, and finally configures
# and loads the secondary plugin that will process the converted output.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a -title_sub is appended
#   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" array refs
#
# Returns the blessed plugin object. When 'info_only' is set the object is
# returned straight after the merge, without any conversion set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs,"-title_sub");
    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method-call syntax instead of the ambiguous indirect-object form
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # convert_options are passed through to gsConvert.pl by ConvertBinaryFile.pm

    # the most important option is the tool that's used to do the conversion
    # default for PDFv2Plugin. If pdfbox_conversion is on, the pdfbox GS extension sets pdf_tool to pdfbox
    $self->{'convert_options'} = "-pdf_tool xpdftools";

    # pdf_zoom is supported by xpdftools' pdftohtml. So for pretty_html and paged_pretty_html
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} .= " -pdf_zoom $zoom";

    # PDFv2Plugin now supports PDF to txt conversion on Windows too:
    # using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.win_pdftotext_info}\n");
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # pretty_html is the best default option when using xpdftools
        $self->{'convert_to'} = "pretty_html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }

    # if pdfbox_conversion is not on, check convert_to to make sure that xpdftools can
    # support the selected output format, or fallback on a sensible default.
    # Not all available conversion output options are possible with xpdftools, as some are
    # only handled by pdfbox.
    if (!$self->{"pdfbox_conversion"}) {
        my $convert_to = $self->{'convert_to'};
        my $fallback_convert_to = $convert_to;
        if ($convert_to =~ /^html$/) {
            $fallback_convert_to = "pretty_html";
        }
        elsif ($convert_to =~ /^pagedimg/) {
            $fallback_convert_to = "paged_pretty_html";
        }
        elsif ($convert_to =~ /^paged_text$/) {
            $fallback_convert_to = "text";
        }

        # Same test as the original: warn and switch only for the formats
        # that were given a fallback above.
        if ($convert_to =~ /^(html|pagedimg|paged_text)/) {
            &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($convert_to, $fallback_convert_to));
            $self->{'convert_to'} = $fallback_convert_to;
        }
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
        if ($self->{'convert_to'} eq "paged_pretty_html") {
            # for paged pretty html, the default should be to sectionalise
            # the single superpage, the one containing divs representing
            # individual pages as sections, on headings.
            # The option needs its leading dash like every other option
            # pushed here; without it the flag is not passed as an option.
            push(@$specific_options, "-sectionalise_using_h_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self is already blessed into $class above
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
259
# Default regular expression for the files this plugin processes:
# any name ending in ".pdf", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_suffix_re = '(?i)\.pdf$';
    return $pdf_suffix_re;
}
265
# Returns an empty block expression, so that HTMLPlug's block expression
# is not inherited.
sub get_default_block_exp {
    return q{};
}
270
# Runs init() of the superclass (ConvertBinaryFile, per the original comment)
# and then AutoLoadConverters::init(), forwarding the caller's arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->AutoLoadConverters::init(@init_args);
}
279
# Runs AutoLoadConverters::begin() and then the superclass begin(),
# forwarding the caller's arguments.
sub begin {
    my ($self, @begin_args) = @_;

    $self->AutoLoadConverters::begin(@begin_args);
    return $self->SUPER::begin(@begin_args);
}
287
# Runs AutoLoadConverters::deinit() and then the superclass deinit(),
# forwarding the caller's arguments.
sub deinit {
    my ($self, @deinit_args) = @_;

    $self->AutoLoadConverters::deinit(@deinit_args);
    return $self->SUPER::deinit(@deinit_args);
}
295
# Selects "hash_on_ga_xml" as the OID hashing method.
#
# Rationale (from the original authors): two PDF files that are identical
# except for their metadata should hash to different values. Without this,
# the HTML generated from each could be *identical* whenever the conversion
# utility does not embed the metadata in its output, which they report to be
# the case with PDFBox. This keeps the plugin consistent with the original
# vision that the same document with different metadata should be seen as a
# different document.
sub get_oid_hash_type {
    my ($self) = @_;

    my $hash_type = "hash_on_ga_xml";
    return $hash_type;
}
313
314
# Chooses which parent implementation converts the file in the tmp area.
#
# For pretty_html and paged_pretty_html the ConvertBinaryFile implementation
# is used, because only xpdftools can output those formats regardless of
# whether pdfbox_conversion is switched on. All other formats are handed to
# AutoLoadConverters::tmp_area_convert_file.
#
# The caller's arguments are forwarded unchanged, and the chosen
# implementation's return value is returned.
sub tmp_area_convert_file {
    my $self = shift (@_);

    if ($self->{'convert_to'} =~ m/pretty_html$/) {
        # Only mention pdfbox_conversion when it really is switched on; the
        # message used to be printed for every pretty_html conversion.
        if ($self->{'pdfbox_conversion'}) {
            print STDERR '@@@@ PDFBox_conversion is switched on, but pretty_html variants are generated by xpdftools.' . "\n";
        }
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # else, output format uses pdfbox:
    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
328
# Override that adds extra handling for the pretty_html and
# paged_pretty_html output modes.
#
# For every other mode this simply delegates to
# ConvertBinaryFile::run_conversion_command and returns its result.
#
# In (paged_)pretty_html mode, Xpdf tools' pdftohtml is told to put its
# products into a "pages" subdir of the tmp area (pdftohtml needs to be
# passed a *non-existent* directory). The base implementation is still run,
# but the value returned here is the intermediary file
# <tmp_dirname>/pages/index.html that a successful pdftohtml run produces.
# The name of the final product -- an html file with the same name and in the
# same tmp location as the input PDF -- is stored in
# $self->{'conv_filename_after_post_process'} for convert_post_process().
sub run_conversion_command {
    my ($self, @conversion_args) = @_;
    my ($tmp_dirname, $tmp_inputPDFname) = @conversion_args;

    my $is_pretty_html = ($self->{'convert_to'} =~ m/pretty_html$/);
    unless ($is_pretty_html) {
        return $self->ConvertBinaryFile::run_conversion_command(@conversion_args);
    }

    # Let ConvertBinaryFile proceed normally; its own return value is
    # superseded by the intermediary index.html path below.
    my $base_output_filename = $self->ConvertBinaryFile::run_conversion_command(@conversion_args);
    my $pages_index_html = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");

    # Work out where the post-processed single html file will be written.
    my ($pdf_basename, $pdf_dir, $pdf_ext)
        = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
    my $final_html = &FileUtils::filenameConcatenate($pdf_dir, $pdf_basename.".html");
    $self->{'conv_filename_after_post_process'} = $final_html;

    return $pages_index_html;
}
364
# Post-processes the file produced by the conversion step.
#
# $conv_filename is the file returned by the conversion. In pretty_html and
# paged_pretty_html mode the HTML pages generated by xpdf's pdftohtml are
# massaged by xpdftohtml_convert_post_process(); every other mode uses
# default_convert_post_process(), PDFPlugin's usual post-processing.
#
# The mode test is anchored the same way as in tmp_area_convert_file() and
# run_conversion_command(), so all three agree on what counts as a
# pretty_html variant.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    if ($self->{'convert_to'} =~ m/pretty_html$/) { # (paged_)pretty_html
        $self->xpdftohtml_convert_post_process($conv_filename);
    }
    else { # use PDFPlugin's usual post processing
        $self->default_convert_post_process($conv_filename);
    }
}
381
# Returns the page number encoded in a file name of the form
# "page<N>.htm(l)" (case-insensitive). Any other name, including index.html,
# is given page number 0.
sub page_number {
    my ($filename) = @_;
    my ($pagenum) = ($filename =~ m/^page(\d+)\.html?$/i);
    return defined $pagenum ? $pagenum : 0;
}

# Called after gsConvert.pl has been run to convert a PDF to
# (paged_)pretty_html using Xpdftools' pdftohtml.
#
# pdftohtml produces one HTML doc per PDF page (index.html, page1.html,
# page2.html, ...) in the "pages" subdirectory of the tmp area. This method
# builds a single html file, $self->{'conv_filename_after_post_process'},
# holding a modified version of each page (see _process_pretty_html_page)
# in page-number order, and then removes all htm(l) files, index.html
# included, from the "pages" subdirectory so that HTMLPlugin does not process
# them as well. The page images stay in "pages".
#
# Parameter: $pages_index_html - tmp/<rand>/pages/index.html
# Dies if the pages directory cannot be read or the output file cannot be
# opened for writing or closed.
sub xpdftohtml_convert_post_process
{
    my $self = shift (@_);
    my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for (paged_)pretty_html output mode
    my $output_filename = $self->{'conv_filename_after_post_process'};

    my ($index_tailname, $pages_subdir)
        = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");

    # Read in the names of all plain files in the pages subdir
    opendir(my $pages_dh, $pages_subdir) || die "can't opendir $pages_subdir: $!";
    my @dir_files = grep {-f "$pages_subdir/$_"} readdir($pages_dh);
    closedir($pages_dh);

    # Keep only the per-page html files (index.html is skipped) and sort them
    # by page number rather than lexically, so page10 follows page9.
    my @page_files = sort { page_number($a) <=> page_number($b) }
                     grep { m/\.html?$/ && !m/^index\.html?/i } @dir_files;
    my $num_html_pages = scalar(@page_files);

    # Opening html tags for the combined file. These are the same tags and
    # contents, including <meta>, as Xpdf's pdftohtml generates for each of
    # its individual html pages.
    my ($output_tailname)
        = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$output_tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";

    if ($self->{'convert_to'} =~ /paged_pretty_html/) { # then add the <h>tags for sectionalising
        $start_text .= "<h1>$output_tailname</h1>\n\n";
    }

    # Don't build a giant string in memory of all the pages concatenated.
    # Instead, write each modified page out as soon as it has been processed.
    my $out_fh;
    if (!open($out_fh, ">:utf8", $output_filename)) {
        # Fully qualified: gsprintf is not imported into this package
        &gsprintf::gsprintf(STDERR, "PDFv2Plugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    foreach my $pagefile (@page_files) {
        my $page_num = page_number($pagefile);
        # A separate variable for the full path, so the loop variable (an
        # alias into @page_files) is left untouched.
        my $pagefile_path = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
        my $modified_page_contents = $self->_process_pretty_html_page($pagefile_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    print $out_fh "</body>\n</html>\n"; # write out closing tags
    # Buffered write errors only surface at close, so check it
    close($out_fh) || die "PDFv2Plugin::xpdftohtml_convert_post_process: error closing $output_filename: $!\n";

    # Get rid of all the htm(l) files incl index.html in the "pages" subdir.
    # Single quotes keep the backslash, so the dot is matched literally.
    &FileUtils::removeFilesFiltered($pages_subdir, '\.html?$');

    # now the tmp area should contain a single html file containing all the
    # html pages' contents in sequence, and a "pages" subdir containing the
    # screenshot images of each page.
    # HTMLPlugin will process these further in the plugin pipeline
}
489
# Debugging aid. Most html <tags> don't get printed out in GLI, so this
# prints its argument to STDERR with every "<" shown as "[" and every ">"
# as "]", prefixed with "#### ".
#
# $string_or_dom may be a plain string or a DOM object; the original authors
# note that DOM objects stringify correctly in string context, so no
# explicit to_string call is needed.
sub _debug_print_html
{
    my ($self, $string_or_dom) = @_;

    my $bracketed = "$string_or_dom";
    $bracketed =~ tr/<>/[]/;

    print STDERR "#### $bracketed\n";
}
508
# Helper for (paged_)pretty_html: reads one page of html generated by Xpdf's
# pdftohtml, reshapes it with Mojo::DOM (https://mojolicious.org/perldoc/Mojo/DOM)
# and returns the result as a string of the form
#   <div id="page<N>"> [headings] <div style="...">...page...</div> </div>
#
# Parameters:
#   $pagefile       - path of the page's html file
#   $page_num       - number of this page
#   $num_html_pages - total number of html pages in the document
#
# For paged_pretty_html, headings are added so the combined html can be
# sectionalised.
sub _process_pretty_html_page
{
    my ($self, $pagefile, $page_num, $num_html_pages) = @_;

    # handling content encoding the same way default_convert_post_process does
    my $page_html = "";
    $self->read_file ($pagefile, "utf8", "", \$page_html);

    my $dom = Mojo::DOM->new($page_html);

    # The page's <style> element has to move into the <div> created below.
    # Its "#f..." id selectors become ".p<page_num>f..." class selectors,
    # matching the class names given to the spans further down.
    my $style_markup = $dom->at('html')->at('style')->to_string;
    my $css_class = ".p".$page_num."f";
    $style_markup =~ s@\#f@$css_class@sg;
    my $style_element = Mojo::DOM->new($style_markup)->at('style');

    # The height of the page's first image sets the height of the div that
    # replaces this page's <body>.
    my $img_height = $dom->find('img')->[0]{height};

    # Point the background image at the "pages" subdir, give it
    # class="background" and make its id unique per page.
    my $background_img = $dom->find('img#background')->[0];
    my $background_src = "pages/" . $background_img->{src};
    $background_img->attr(src => $background_src);
    $background_img->attr({class => "background", id => "background".$page_num});

    # Span ids are reused from page to page although element ids ought to be
    # unique, so each span's id moves into a page-specific class and the id
    # attribute is dropped.
    $dom->find('div.txt span')->each(sub {
        my ($span) = @_;
        $span->attr(class => "p". $page_num. $span->{id});
        delete $span->{id};
    });

    # New dom for this page: a positioned div that holds the modified style
    # element followed by every child node of the old <body>.
    my $page_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );
    $page_dom->at('div')->append_content($style_element);
    foreach my $body_node ($dom->at('body')->child_nodes->each) {
        $page_dom->at('div')->append_content($body_node);
    }
    my $inner_div_str = $page_dom->to_string;

    # Outer div, with optional headings in front of the inner div
    my @parts = ('<div id="page' . $page_num . '">' . "\n");

    if ($self->{'convert_to'} =~ /paged_pretty_html/) { # then add the <h>tags for sectionalising

        # Page-range bucket heading: written on the first page of a bucket
        # of 10 (page numbers ending in 1) when more than 10 pages follow it.
        # Dr Bainbridge thinks for now we need only consider PDFs where the
        # total number of pages < 1000.
        my $offset_in_bucket = $page_num % 10;
        my $pages_after_this = $num_html_pages - $page_num;
        if ($offset_in_bucket == 1 && $pages_after_this > 10) {
            my $start_range = $page_num - $offset_in_bucket + 1;
            my $end_range = $page_num + 10 - $offset_in_bucket;
            push @parts, '<h2 style="font-size:1em;font-weight:normal;">Pages '
                . $start_range . "-" . $end_range . "</h2>\n";
        }

        # No sectionalising for 10 pages or under. Otherwise every page is a
        # section too, with a simple "Page #" heading.
        if ($num_html_pages > 10) {
            push @parts, '<h3 style="font-size:1em;font-weight:normal;">Page '
                . $page_num . "</h3>\n";
        }
    }

    push @parts, $inner_div_str, "\n</div>";

    # Finished massaging this single html page into the right form
    return join('', @parts);
}
613
# PDFv2Plugin post-processing for every conversion mode except "pretty_html"
# and "paged_pretty_html". This is what PDFPlugin always used to do.
#
# Reads $conv_filename, counts the pages (stored in $self->{'num_pages'}),
# and -- when use_sections is on and the conversion produced HTML -- wraps
# each page in Greenstone <Section> comments. The text is then written back
# to $conv_filename.
#
# Parameter: $conv_filename - the converted file to rewrite in place
sub default_convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;
    my $outhandle=$self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: the section code below
    # wants each new page to be prefixed with <a name=pagenum></a>, which it
    # then splits on. PDFBox does not generate that, but it embeds each page
    # in an extra div whose opener is
    #    <div style="page-break-before:always; page-break-after:always">
    # so <a name=0></a> is prefixed to each such div. The page number is
    # fixed at 0 here; the section loop below replaces it with an
    # incrementing counter.
    my $loopcounter = 0; # used later on!
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as it might
    # be useful to give an indication of document length in browser through setting
    # num_pages as metadata.
    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFv2Plugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFv2Plugin: warning - no sections found\n";
        } else {
            # keep HTML header etc as top_section
            my $html_header = shift @sections;
            $top_section .= $html_header if defined $html_header;
        }

        # handle first section specially for title? Or all use first 100...

        my $title = $sections[0];
        $title = "" unless defined $title; # empty input leaves no sections at all
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;


        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Only read $1 when the substitution matched; after a failed
            # match $1 would still hold the capture of an earlier match.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1; # Greenstone does magic if sections are titled digits
            }

            # A title of pagenum=0 means use_sections is being applied on output from PDFBox,
            # which didn't originally have a <a name=incremented pagenumber></a> to split each page.
            # Our Perl code then prefixed <a name=0></a> to it. Now need to increment the pagenum here:
            if ($loopcounter > 0 || (defined $section_title && $section_title eq "0")) { # implies use_sections with PDFBox
                $section_title = ++$loopcounter;
            }

            if (! defined($section_title) ) {
                print STDERR "no title: $section\n";
                $section_title = " "; # get rid of the undefined warning in next line
            }
            my $newsection = "<!-- from PDFv2Plugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><br />\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the @sections element
        }

        $text=join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # unicode::ensure_utf8(\$text) should no longer be needed, now that
    # strings read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
745
746
# Plugin-specific processing of $doc_obj.
#
# Runs process_type() on the file, then on the top section:
#   - replaces every "date" metadata value with a "Date" value in
#     Greenstone's YYYYMMDD format (values that hold no d-d-d date, or whose
#     year is 0, are simply dropped),
#   - adds "NumPages" when the page count is known,
#   - sets "gsdlthistype" when use_sections was applied to HTML output.
#
# Parameters: $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# Returns the result of process_type().
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Only use the captures when the pattern matched; after a failed
        # match $1..$3 would still hold values from an earlier match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1,$2,$3);

        next if $year == 0;
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
791
7921;
Note: See TracBrowser for help on using the repository browser.