root/main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm @ 32215

Revision 32215, 30.5 KB (checked in by ak19, 16 months ago)

Before reorganising our PDFPlugin in whatever way we ultimately decide, committing a version where, on paged_html output mode, the pages produced by Xpdf's pdftohtml are sectionalised by default if total num pages is more than 10. Also changing inserted HTML heading tags to get the page title to still appear correctly.

  • Property svn:keywords set to Author Date Id Revision
RevLine 
###########################################################################
#
# PDFPlugin.pm -- reasonably with-it pdf plugin
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package PDFPlugin;

use strict;
no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
no strict 'subs'; # allow filehandles to be variables and viceversa

use File::Basename (); # fileparse() is called fully qualified; nothing is imported

use ReadTextFile;
use unicode;
use Mojo::DOM; # for HTML parsing

use AutoLoadConverters;
use ConvertBinaryFile;

@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
[22705]39
40
# The values accepted by this plugin's -convert_to option. Each entry gives
# the option value ('name') and the key of its description string ('desc').
my $convert_to_list =
    [ { 'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}" },
      { 'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}" },
      { 'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}" },
      { 'name' => "paged_html",
        'desc' => "{PDFPlugin.convert_to.paged_html}" },
      { 'name' => "pagedimg_jpg",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}" },
      { 'name' => "pagedimg_gif",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}" },
      { 'name' => "pagedimg_png",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}" },
    ];
57
58
# Declaration of the command-line/config arguments this plugin accepts.
# Each entry names the option, the key of its description string, its type
# and (where given) its default value.
my $arguments =
    [
      { 'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "html" },
      { 'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "{CommonUtil.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp() },
      { 'name' => "metadata_fields",
        'desc' => "{HTMLPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title,Author,Subject,Keywords" },
      { 'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "noimages",
        'desc' => "{PDFPlugin.noimages}",
        'type' => "flag" },
      { 'name' => "allowimagesonly",
        'desc' => "{PDFPlugin.allowimagesonly}",
        'type' => "flag" },
      { 'name' => "complex",
        'desc' => "{PDFPlugin.complex}",
        'type' => "flag" },
      { 'name' => "nohidden",
        'desc' => "{PDFPlugin.nohidden}",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "{PDFPlugin.zoom}",
        'deft' => "2",
        'range' => "1,3", # actually the range is 0.5-3
        'type' => "int" },
      { 'name' => "use_sections",
        'desc' => "{PDFPlugin.use_sections}",
        'type' => "flag" },
      { 'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag" },
      { 'name' => "use_realistic_book",
        'desc' => "{PDFPlugin.use_realistic_book}",
        'type' => "flag" }
    ];
[3540]111
# Plugin description record; new() pushes it onto the shared "OptList".
my $options = { 'name'     => "PDFPlugin",
                'desc'     => "{PDFPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'srcreplaceable' => "yes", # Source docs in PDF can be replaced with GS-generated html
                'args'     => $arguments };
118
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      is appended to it
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" entries receive
#                      this plugin's argument and option declarations
#
# Builds the object by merging an AutoLoadConverters instance (configured
# for "PDFBoxConverter") with a ConvertBinaryFile instance, derives the
# conversion options from the parsed arguments, and assembles the option
# list handed to the secondary plugin before loading it.
#
# Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs,"-title_sub");
    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    if ($self->{'convert_to'} eq "text" && defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
        if ($self->{'convert_to'} eq "paged_html") {
            # For paged html, the default should be to sectionalise on headings
            # the single superpage containing divs representing individual
            # pages as sections. Like every other option pushed here, the flag
            # needs its leading "-" to be recognised as an option.
            push(@$specific_options, "-sectionalise_using_h_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self was already blessed into $class above, so no second bless is needed
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
223
# Returns the default process_exp: a case-insensitive regular expression
# (as a string) matching file names that end in ".pdf".
# Takes no meaningful arguments; it is also called as a plain function when
# the argument table is built.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
[2661]229
# Returns the default block_exp: the empty string,
# so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return "";
}
[22861]234
# Initialises the plugin: runs the superclass init (ConvertBinaryFile init)
# first, then AutoLoadConverters::init, passing both the same arguments.
# Returns whatever AutoLoadConverters::init returns.
sub init {
    my $self = shift (@_);

    # ConvertBinaryFile init
    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
243
# Runs AutoLoadConverters::begin and then the superclass begin, passing
# both the same arguments. Returns whatever the superclass begin returns.
sub begin {
    my $self = shift (@_);

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
251
# Runs AutoLoadConverters::deinit and then the superclass deinit, passing
# both the same arguments. Returns whatever the superclass deinit returns.
sub deinit {
    my $self = shift (@_);

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
259 
# Setting hashing to be on the ga xml ensures that two PDF files that are
# identical except for their metadata hash to different values. Without
# this, when each PDF file is converted to HTML there is a chance that they
# will both be *identical* if the conversion utility does not embed the
# metadata in the generated HTML. This is certainly the case when PDFBOX
# is being used.
#
# This change makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should
# be seen as different.
#
# Returns the string "hash_on_ga_xml".
sub get_oid_hash_type {
    my $self = shift (@_);
    return "hash_on_ga_xml";
}
277 
278 
# Delegates explicitly to AutoLoadConverters::tmp_area_convert_file with the
# same arguments and returns its result.
sub tmp_area_convert_file {
    my $self = shift (@_);
    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
285
# Overriding to do some extra handling for paged_html output mode.
#
# Parameters: ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix,
#              $tailname, $suffix), all passed on unchanged to
#              ConvertBinaryFile::run_conversion_command.
#
# For every mode other than paged_html this simply returns what
# ConvertBinaryFile::run_conversion_command returns.
#
# For paged_html it returns the path <tmp_dirname>/pages/index.html and
# records, in $self->{'conv_filename_after_post_process'}, the path of the
# final html file that the post-processing step is to produce.
sub run_conversion_command {
    my $self = shift (@_);
    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    if ($self->{'convert_to'} ne "paged_html") {
        return $self->ConvertBinaryFile::run_conversion_command(@_);
    }

    # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it
    # to create a subdir called "pages" in the tmp area to puts its products
    # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory
    # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,
    # the intermediary output file tmp/<random-num>/pages/index.html should
    # exist (besides other output products there)

    # We let ConvertBinaryFile proceed normally, but the return value should reflect
    # that on success it should expect the intermediary product tmpdir/pages/index.html
    # (which is the product of xpdftohtml conversion). The file name that
    # ConvertBinaryFile reports is therefore deliberately not used.
    my $ignored_default_output = $self->ConvertBinaryFile::run_conversion_command(@_);
    my $output_filename = FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");

    # However, when convert_post_process() is done, it should have output the final
    # product of the paged_html conversion: an html file of the same name and in the
    # same tmp location as the input PDF file.
    my ($name_prefix, $output_dir, $ext)
        = File::Basename::fileparse($tmp_inputPDFname, '\.[^\.]+$');
    $self->{'conv_filename_after_post_process'} = FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
#    print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";

    return $output_filename;
}
319
# Post-processes the file produced by the conversion step.
#
# Parameter: $conv_filename - path of the converted file.
#
# Dispatches on the convert_to mode and returns whatever the chosen
# post-processing routine returns.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    if ($self->{'convert_to'} eq "paged_html") {
        # special post-processing for paged_html mode, as HTML pages generated
        # by xpdf's pdftohtml need to be massaged into the form we want
        return $self->xpdftohtml_convert_post_process($conv_filename);
    }

    # use PDFPlugin's usual post processing
    return $self->default_convert_post_process($conv_filename);
}
336
# Helper for xpdftohtml_convert_post_process: extracts the page number from a
# file name of the form page<N>.htm(l) (case-insensitive). Any other name,
# including index.html, is given page number 0.
sub page_number {
    my ($dir) = @_;
    my ($pagenum) = ($dir =~ m/^page(\d+)\.html?$/i);
    $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
    return $pagenum;
}

# Called after gsConvert.pl has been run to convert a PDF to paged_html
# using Xpdftools' pdftohtml
# This method will do some cleanup of the HTML files produced after XPDF has produced
# an HTML doc for each PDF page: it first gets rid of the default index.html.
# Instead, it constructs a single html page containing each original HTML page
# <body> nested as divs instead, with simple section information inserted at the top
# of each 'page' <div> and some further styling customisation. This HTML manipulation
# is to be done with the Mojo::DOM perl package.
# Note that since xpdf's pdftohtml would have failed if the output dir already
# existed and for simpler naming, the output files are created in a new "pages"
# subdirectory of the tmp location parent of $conv_filename instead
#
# Parameter: $pages_index_html - tmp/<rand>/pages/index.html for paged_html
#            output mode; only its directory part is used.
# Reads:     $self->{'conv_filename_after_post_process'}, the file to create.
# Dies if the pages directory cannot be read or the output file cannot be
# opened or closed.
sub xpdftohtml_convert_post_process
{
    my $self = shift (@_);
    my ($pages_index_html) = @_;
    my $output_filename = $self->{'conv_filename_after_post_process'};

    # Read in all the html files in tmp's "pages" subdir, except for index.html.
    # and use it to create a new html file called $self->{'conv_filename_after_post_process'}
    # which will consist of a slightly modified version of
    # each of the other html files concatenated together.

    my ($tailname, $pages_subdir, $suffix)
        = File::Basename::fileparse($pages_index_html, '\.[^\.]+$');

    # Code from util::create_itemfile()
    # Read in all the files
    opendir(my $pages_dh, $pages_subdir) or die "can't opendir $pages_subdir: $!";
    my @page_files = grep { -f "$pages_subdir/$_" } readdir($pages_dh);
    closedir($pages_dh);

    # Sort the files in the directory in the order of page_num rather than
    # lexically (files are named index.html, page1.html, ..., pagen.html),
    # then keep only the htm(l) files other than the index file.
    # For every html file there may be an img file and potentially other
    # file types too, which is why the html pages are selected by name.
    my @html_pages = grep { m/\.html?$/ && !/^index\.html?/i }
                     sort { page_number($a) <=> page_number($b) } @page_files;
    my $num_html_pages = scalar(@html_pages);

    # Prepare to create our new html page that will contain all the individual
    # htmls generated by xpdf's pdftohtml in sequence.
    # First write the opening html tags out to the output file. These are the
    # same tags and their contents, including <meta>, as is generated by
    # Xpdf's pdftohtml for each of its individual html pages.
    my ($output_tailname, $tmp_subdir, $html_suffix)
        = File::Basename::fileparse($output_filename, '\.[^\.]+$');
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$output_tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";
    $start_text .= "<h1>$output_tailname</h1>\n\n";

    # Don't want to build a giant string in memory of all the pages concatenated
    # and then write it out in one go. Instead, build up the final single page
    # by writing each modified paged_html file out to it as this is processed.
    # Copying file open/close code from CommonUtil::utf8_write_file()
    my $out_fh;
    if (!open ($out_fh, ">:utf8", $output_filename)) {
        gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    # Get the contents of each individual HTML page generated by Xpdf, after first
    # modifying each, and write each out into our single all-encompassing html
    foreach my $pagefile (@html_pages) {
        my $page_num = page_number($pagefile);
        # get full path to pagefile (kept in its own variable so that the
        # list of file names is not modified through the loop alias)
        my $pagefile_path = FileUtils::filenameConcatenate($pages_subdir, $pagefile);
#       print STDERR "@@@ About to process html file $pagefile_path (num $page_num)\n";
        my $modified_page_contents = $self->_process_paged_html_page($pagefile_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    # we've now created a single HTML file by concatenating (a modified version)
    # of each paged html file
    print $out_fh "</body>\n</html>\n"; # write out closing tags
    # buffered write errors only surface at close, so check it
    close($out_fh)
        or die "PDFPlugin::xpdftohtml_convert_post_process: error closing $output_filename: $!\n";

    # Get rid of all the htm(l) files incl index.html in the associated "pages"
    # subdir, since we've now processed them all into a single html file
    # one folder level up and we don't want HTMLPlugin to process all of them next.
    # The pattern is single-quoted so that the backslash before the dot survives
    # and only names ending in ".htm"/".html" are matched.
    # now the tmp area should contain a single html file contain all the html pages'
    # contents in sequence, and a "pages" subdir containing the screenshot images
    # of each page.
    # HTMLPlugin will process these further in the plugin pipeline
    return FileUtils::removeFilesFiltered($pages_subdir, '\.html?$'); # no specific whitelist, but blacklist htm(l)
}
441
# For whatever reason, most html <tags> don't get printed out in GLI
# So when debugging, use this function to print them out as [tags] instead.
#
# Parameter: $string_or_dom - a string, or an object that stringifies (the
#            original author notes that Mojo::DOM objects get correctly
#            stringified in string contexts, so no explicit conversion call
#            is needed).
# Prints the text to STDERR, prefixed with "#### ", with every "<" shown as
# "[" and every ">" shown as "]".
sub _debug_print_html
{
    my $self = shift (@_);
    my ($string_or_dom) = @_;

    # work on a copy so that the caller's value is left untouched
    my $escaped_txt = $string_or_dom;
    $escaped_txt =~ s@\<@[@sg;
    $escaped_txt =~ s@\>@]@sg;

    print STDERR "#### $escaped_txt\n";
}
460
# Helper function to read in each paged_html generated by Xpdf's pdftohtml
# then modify the html suitably using the HTML parsing functions offered by
# Mojo::DOM, then return the modified HTML content as a string
# See https://mojolicious.org/perldoc/Mojo/DOM
#
# Parameters:
#   $pagefile       - path of the single-page html file to read
#   $page_num       - number of this page
#   $num_html_pages - total number of html pages in the document
#
# Returns a string holding an outer <div id="page<N>"> that contains optional
# section headings followed by the reworked page content.
# Dies with an explanatory message if the page lacks the <style>, <img>,
# <img id="background"> or <body> element this routine relies on.
sub _process_paged_html_page
{
    my $self = shift (@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    my $text = "";

    # handling content encoding the same way default_convert_post_process does
    $self->read_file ($pagefile, "utf8", "", \$text);

    my $dom = Mojo::DOM->new($text);

#    $self->_debug_print_html($dom);

    # There's a <style> element in the <html>; we need to shift it into the
    # <div> tag that we'll be creating. Take the first style element and
    # slightly modify it; it is later inserted as a child of the
    # all-encompassing div created below.
    my $html_node  = $dom->at('html');
    my $style_node = defined $html_node ? $html_node->at('style') : undef;
    die "PDFPlugin::_process_paged_html_page: no <style> element found in $pagefile\n"
        unless defined $style_node;
    my $page_style_tag_str = $style_node->to_string;
    # In the style tag, convert id style references to class style references
    # NOTE(review): this rewrites every "#f" in the style text, which would
    # also hit a colour value such as "#ff0000" if one appears there - confirm.
    my $css_class = ".p".$page_num."f";
    $page_style_tag_str =~ s@\#f@$css_class@sg;
    my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified
#$self->_debug_print_html($style_element);

    # need to know the image's height to set the height of the surrounding
    # div that's to replace this page's <body>:
    my $first_img = $dom->at('img');
    die "PDFPlugin::_process_paged_html_page: no <img> element found in $pagefile\n"
        unless defined $first_img;
    my $img_height = $first_img->{height};

    # 2. Adjust the img#background src attribute to point to the pages subdir for imgs
    # 3. Set that img tag's class=background, and change its id to background+$page_num
    my $bg_img_tag = $dom->at('img#background');
    die "PDFPlugin::_process_paged_html_page: no <img id=\"background\"> element found in $pagefile\n"
        unless defined $bg_img_tag;
    my $img_src_str = "pages/" . $bg_img_tag->{src};
    $bg_img_tag->attr(src => $img_src_str); # reset
#$self->_debug_print_html($bg_img_tag);
    # set both class and modified id attributes in one step:
    $bg_img_tag->attr({class => "background", id => "background".$page_num});
#$self->_debug_print_html($bg_img_tag);

    # get all the <span> nested inside <div class="txt"> elements and
    # 1. set their class attr to be "p + page_num + id-of-the-span",
    # 2. then delete the id, because the span ids have been reused when element
    # ids ought to be unique. Which is why we set the modified ids to be the
    # value of the class attribute instead
    $dom->find('div.txt span')->each(sub {
        my ($span) = @_;
        $span->attr(class => "p". $page_num. $span->{id});
        delete $span->{id};
    }); # both changes done in one find() operation
#$self->_debug_print_html($dom->find('div.txt span')->last);

    my $body_node = $dom->at('body');
    die "PDFPlugin::_process_paged_html_page: no <body> element found in $pagefile\n"
        unless defined $body_node;

    # Finally can create our new dom, starting with a div tag for the current page
    # Must be: <div style="position:relative; height:$img_height;"/>
    my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );
    # the page div is looked up once and reused for every append below
    my $page_container = $new_dom->at('div');
    $page_container->append_content($style_element);

    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
    $body_node->child_nodes->each(sub {
        my ($child) = @_;
        $page_container->append_content($child);
    });
#$self->_debug_print_html($new_dom);

    # build up the outer div with the <h>tags for sectionalising
    my $inner_div_str = $new_dom->to_string;

    my $page_div = "<div id=\"page".$page_num."\">\n";
    # Append a page range bucket heading if applicable: when we're on the
    # first page of a bucket of 10 pages AND more than 10 pages remain after
    # the current one.
    # Dr Bainbridge thinks for now we need only consider PDFs where the
    # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
    # NOTE(review): because of the "> 10" test the final bucket(s) of a
    # document get no range heading of their own - confirm this is intended.
    if (($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) {
        # Double-digit page numbers that start with 2
        # i.e. 21 to 29 (and 30) should be in 21 to 30 range
        my $start_range = $page_num - ($page_num % 10) + 1;
        my $end_range = $page_num + 10 - ($page_num % 10);
        $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h2>\n";
    }

    # No sectionalising for 10 pages or under. Otherwise, every page is a section too, not just buckets
    if ($num_html_pages > 10) {
        # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #"
        $page_div .= "<h3 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h3>\n";
    }

    $page_div .= $inner_div_str;
    $page_div .= "\n</div>";

    # Finished processing a single html page of the paged_html output generated by
    # Xpdf's pdftohtml: finished massaging that single html page into the right form
    return $page_div;
}
561
# This subroutine is called to do the PDFPlugin post-processing for all cases
# except the "paged_html" conversion mode. This is what PDFPlugin always used to do:
#
# Parameter: $conv_filename - path of the converted file; it is read,
#            modified in memory and written back to the same path.
#
# Sets $self->{'num_pages'} from the number of <a name=...> anchors found.
# When use_sections is on and the file was converted to HTML, the text is
# split at those anchors and each piece is wrapped in Section markup.
sub default_convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;
    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs into pages for
    # sections. The PDFPlugin code wants each new page to be prefixed with <a name=pagenum></a>,
    # which it then splits on to generate page-based sections. However, that's not what PDFBox
    # generates in its HTML output. Fortunately, PDFBox does have its own page-separator: it
    # embeds each page in an extra div. The div opener is:
    # <div style=\"page-break-before:always; page-break-after:always\">
    # The PDFPlugin now looks for this and prefixes <a name=0></a> to each such div. (The
    # pagenumber is fixed at 0 since I'm unable to work out how to increment the pagenum during
    # a regex substitution even with regex extensions on.) Later, when we process each section
    # to get the pagenum, PDFBox's output for this is pre-processed by having a loopcounter
    # that increments the pagenum for each subsequent section.

    #$pdfbox_pageheader="\<div style=\"page-break-before:always; page-break-after:always\">";
    my $loopcounter = 0; # used later on!
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page).  Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as it might
    # be useful to give an indication of document length in browser through setting
    # num_pages as metadata.
    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig); #<div style=\"?page-break-before:always; page-break-after:always\"?>
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        # Decide once, before the header is shifted off, whether the split
        # found anything. Re-testing the count after the shift would mistake a
        # document with exactly one page anchor for an unsplit one and emit
        # the leftover of the consumed anchor (e.g. "1></a>") as page text.
        my $no_sections_found = (scalar (@sections) == 1);

        if ($no_sections_found) { #only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # handle first section specially for title? Or all use first 100...

        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if ($no_sections_found) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Only trust the capture when the substitution actually matched;
            # after a failed match $1 still holds an earlier capture.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1; # Greenstone does magic if sections are titled digits
            }

            # A title of pagenum=0 means use_sections is being applied on output from PDFBox,
            # which didn't originally have a <a name=incremented pagenumber></a> to split each page.
            # Our Perl code then prefixed <a name=0></a> to it. Now need to increment the pagenum here:
            if ($loopcounter > 0
                || (defined $section_title && $section_title eq "0" && $loopcounter == 0)) { # implies use_sections with PDFBox
                $section_title = ++$loopcounter;
            }

            if (! defined($section_title) ) {
                print STDERR "no title: $section\n";
                $section_title = " "; # get rid of the undefined warning in next line
            }
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><br />\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the array element, so this updates @sections
        }

        $text=join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
##    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
[7287]692
693
# Do plugin-specific processing of doc_obj for a converted PDF document.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# Only $base_dir, $file and $doc_obj are used in this sub; the others are
# unpacked so the argument order stays visible at a glance.
#
# What it does:
#   * delegates the main work to $self->process_type()
#   * replaces each lower-case "date" metadata value on the top section with
#     a "Date" value made of year, month and day concatenated
#   * records "NumPages" when $self->{'num_pages'} is defined
#   * sets "gsdlthistype" when sections were generated from HTML output
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        # The raw "date" value is always removed, whether or not it can be
        # converted into a "Date" value below.
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Only read $1..$3 when the match succeeded: after a failed match
        # they still hold the captures of an earlier successful match, which
        # would otherwise be turned into a bogus Date value.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;                    # all-zero ModDate, see above
        $year += 1900 if $year < 100;          # just to be safe
        $month = "0$month" if length($month) == 1;  # single digit
        $day   = "0$day"   if length($day) == 1;    # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'})
        if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
738
7391;
Note: See TracBrowser for help on using the browser.