source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there, so I'm optimistically committing the changes. 1. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdftools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and we need to do some further HTML style, attribute and structure modifications to massage the xpdftools output into what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but I need to check with the others about that.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.4 KB
Line 
1###########################################################################
2#
3# PDFPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package PDFPlugin;
26
27use strict;
28no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29no strict 'subs'; # allow filehandles to be variables and viceversa
30
31use ReadTextFile;
32use unicode;
33use Mojo::DOM; # for HTML parsing
34
35use AutoLoadConverters;
36use ConvertBinaryFile;
37
38@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
39
40
# Values accepted by the -convert_to option. Each entry pairs the option
# value ('name') with the resource-bundle key of its description ('desc').
my $convert_to_list = [
    { 'name' => "auto",         'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "html",         'desc' => "{ConvertBinaryFile.convert_to.html}" },
    { 'name' => "text",         'desc' => "{ConvertBinaryFile.convert_to.text}" },
    { 'name' => "paged_html",   'desc' => "{PDFPlugin.convert_to.paged_html}" },
    { 'name' => "pagedimg_jpg", 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}" },
    { 'name' => "pagedimg_gif", 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}" },
    { 'name' => "pagedimg_png", 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}" },
];
57
58
# Arguments understood by PDFPlugin. Each entry gives the option name, the
# resource-bundle key of its description, its type and (where applicable) a
# default value, a value list or a range.
my $arguments = [
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'list' => $convert_to_list,
      'deft' => "html",
      'reqd' => "yes" },
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'deft' => get_default_process_exp(),
      'reqd' => "no" },
    { 'name' => "block_exp",
      'desc' => "{CommonUtil.block_exp}",
      'type' => "regexp",
      'deft' => get_default_block_exp() },
    { 'name' => "metadata_fields",
      'desc' => "{HTMLPlugin.metadata_fields}",
      'type' => "string",
      'deft' => "Title,Author,Subject,Keywords" },
    { 'name' => "metadata_field_separator",
      'desc' => "{HTMLPlugin.metadata_field_separator}",
      'type' => "string",
      'deft' => "" },
    { 'name' => "noimages",        'desc' => "{PDFPlugin.noimages}",        'type' => "flag" },
    { 'name' => "allowimagesonly", 'desc' => "{PDFPlugin.allowimagesonly}", 'type' => "flag" },
    { 'name' => "complex",         'desc' => "{PDFPlugin.complex}",         'type' => "flag" },
    { 'name' => "nohidden",        'desc' => "{PDFPlugin.nohidden}",        'type' => "flag" },
    { 'name'  => "zoom",
      'desc'  => "{PDFPlugin.zoom}",
      'type'  => "int",
      'deft'  => "2",
      # The declared range is 1-3; the original author notes that the
      # actual range is 0.5-3.
      'range' => "1,3" },
    { 'name' => "use_sections",       'desc' => "{PDFPlugin.use_sections}",       'type' => "flag" },
    { 'name' => "description_tags",   'desc' => "{HTMLPlugin.description_tags}",  'type' => "flag" },
    { 'name' => "use_realistic_book", 'desc' => "{PDFPlugin.use_realistic_book}", 'type' => "flag" },
];
111
# Plugin description record that new() pushes onto the caller's "OptList".
my $options = {
    'name'           => "PDFPlugin",
    'desc'           => "{PDFPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
118
# Constructor.
#
# Registers this plugin's arguments and options with the caller-supplied
# lists, builds the object by merging an AutoLoadConverters instance with a
# ConvertBinaryFile instance, works out the effective conversion target and
# finally configures and loads the secondary plugin that will consume the
# converted output.
#
# Parameters: ($class, $pluginlist, $inputargs, $hashArgOptLists)
# Returns:    the blessed plugin object.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 ".
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';
    push(@$inputargs, "-title_sub", $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["PDFBoxConverter"], 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = bless BaseImporter::merge_inheritance($auto_converter_self, $cbf_self), $class;

    # When only plugin information is wanted, don't worry about any options.
    return $self if $self->{'info_only'};

    $self->{'file_type'} = "PDF";

    # These are passed through to gsConvert.pl by ConvertBinaryFile.pm.
    my @convert_options = ("-pdf_zoom " . $self->{"zoom"});
    push(@convert_options, "-pdf_complex")           if $self->{"complex"};
    push(@convert_options, "-pdf_nohidden")          if $self->{"nohidden"};
    push(@convert_options, "-pdf_ignore_images")     if $self->{"noimages"};
    push(@convert_options, "-pdf_allow_images_only") if $self->{"allowimagesonly"};
    $self->{'convert_options'} = join(" ", @convert_options);

    # Check convert_to.
    # TODO: Start supporting PDF to txt on Windows if we're going to be using
    # XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # "auto" currently just means html
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'} && $self->{'convert_to'} ne "html") {
        print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
        $self->{'convert_to'} = "html";
    }

    # Sets convert_to_plugin and convert_to_ext.
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    $secondary_plugin_options->{$secondary_plugin_name} = []
        unless defined $secondary_plugin_options->{$secondary_plugin_name};
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if (defined($associate_tail_re) && $associate_tail_re ne "") {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - what about pdfbox?
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");

        # Instruct HTMLPlugin (when eventually accessed through
        # read_into_doc_obj) to extract these metadata fields from the
        # HEAD META fields.
        my $metadata_fields = $self->{'metadata_fields'};
        if (defined $metadata_fields && $metadata_fields =~ /\S/) {
            push(@$specific_options, "-metadata_fields", $metadata_fields);
        }
        else {
            push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        }

        my $metadata_field_separator = $self->{'metadata_field_separator'};
        if (defined $metadata_field_separator && $metadata_field_separator =~ /\S/) {
            push(@$specific_options, "-metadata_field_separator", $metadata_field_separator);
        }

        # use_sections implies description_tags
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        push(@$specific_options, "-use_realistic_book") if $self->{'use_realistic_book'};
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000", "-enable_cache", "-processing_tmp_files");
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
220
# Returns the default process_exp: a case-insensitive pattern (as a string)
# matching file names that end in ".pdf".
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
226
# Returns the default block_exp. It is deliberately empty so that this
# plugin does not inherit HTMLPlugin's block expression.
sub get_default_block_exp {
    return '';
}
231
# Initialises the plugin: first the inherited (SUPER) init, then the
# AutoLoadConverters init, both with the caller's arguments. Returns the
# result of the AutoLoadConverters call.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);    # ConvertBinaryFile init
    return $self->AutoLoadConverters::init(@_);
}
240
# Start-of-run hook: calls AutoLoadConverters' begin first and then the
# inherited (SUPER) begin, both with the caller's arguments. Returns the
# result of the SUPER call.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
248
# Tear-down hook: calls AutoLoadConverters' deinit first and then the
# inherited (SUPER) deinit, both with the caller's arguments. Returns the
# result of the SUPER call.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
256
# Setting the hash type to "hash_on_ga_xml" ensures that two PDF files that
# are identical except for their metadata hash to different values. Without
# this, when each PDF file is converted to HTML there is a chance that the
# results will be *identical* if the conversion utility does not embed the
# metadata in the generated HTML. This is certainly the case when PDFBox is
# being used.
#
# This makes this convert-to based plugin more consistent with the original
# vision that the same document with different metadata should be seen as
# a different document.
sub get_oid_hash_type {
    return "hash_on_ga_xml";
}
274
275
# Explicitly routes tmp_area_convert_file to the AutoLoadConverters
# implementation, passing all arguments through and returning its result.
sub tmp_area_convert_file {
    my ($self, @convert_args) = @_;

    return $self->AutoLoadConverters::tmp_area_convert_file(@convert_args);
}
282
# Post-processes the file produced by the conversion step.
#
# In "paged_html" mode the per-page HTML generated by xpdf's pdftohtml needs
# to be massaged into the form we want, so that case is handed to
# xpdftohtml_convert_post_process(); every other mode uses PDFPlugin's usual
# post-processing in default_convert_post_process().
#
# Parameters: $conv_filename - path of the converted file.
# Returns:    whatever the chosen post-processing method returns.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    return $self->xpdftohtml_convert_post_process($conv_filename)
        if $self->{'convert_to'} eq "paged_html";

    return $self->default_convert_post_process($conv_filename);
}
300
# Returns the page number embedded in an Xpdf page file name such as
# "page12.html" (matched case-insensitively), or 0 for any other name
# (e.g. index.html or a page image).
sub page_number {
    my ($dir) = @_;
    my ($pagenum) = ($dir =~ m/^page(\d+)\.html?$/i);
    $pagenum = 0 unless defined $pagenum;
    return $pagenum;
}

# Called after gsConvert.pl has been run to convert a PDF to paged_html
# using Xpdftools' pdftohtml.
#
# Xpdf's pdftohtml produces one HTML file plus one image per PDF page, and
# an index.html. Since pdftohtml would have failed if its output directory
# already existed, and for simpler naming, those files live in a "pages"
# subdirectory next to $output_filename. This method builds a single HTML
# file at $output_filename that contains a modified copy of each page (see
# _process_paged_html_page) in page-number order, and then removes the
# htm(l) files from "pages" so that HTMLPlugin does not process each of them
# as well. The page images are left in place.
#
# Parameters: $output_filename - tmp location + file name of the single
#                                HTML file to create.
# Returns:    the result of FileUtils::removeFilesFiltered().
# Dies if the pages directory cannot be read, or if the output file cannot
# be opened or closed.
sub xpdftohtml_convert_post_process
{
    my $self = shift (@_);
    my ($output_filename) = @_;

    my ($tailname, $tmp_dir)
        = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages");

    # Read in the names of all the plain files in the pages directory.
    opendir(my $pages_dh, $pages_subdir) || die "can't opendir $pages_subdir: $!";
    my @page_files = grep { -f "$pages_subdir/$_" } readdir($pages_dh);
    closedir($pages_dh);

    # Keep only the per-page html files (index.html and the images are
    # skipped) and order them by page number rather than lexically, so that
    # page10 follows page9. The same list gives both the page count and the
    # processing order.
    my @html_pages = sort { page_number($a) <=> page_number($b) }
                     grep { m/\.html?$/ && $_ !~ m/^index\.html?/i } @page_files;
    my $num_html_pages = scalar(@html_pages);

    # Opening tags of the combined page. These are the same tags, including
    # <meta>, that pdftohtml generates for each of its individual pages.
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";

    # Write each modified page out as soon as it has been processed instead
    # of building one giant string of all the pages in memory.
    my $out_fh;
    if (!open ($out_fh, ">:utf8", $output_filename)) {
        gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    foreach my $page_name (@html_pages) {
        my $page_num  = page_number($page_name);
        my $page_path = &FileUtils::filenameConcatenate($pages_subdir, $page_name);
        my $modified_page_contents
            = $self->_process_paged_html_page($page_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    print $out_fh "</body>\n</html>\n";

    # Buffered write errors only surface on close, so check it.
    close($out_fh)
        || die "PDFPlugin::xpdftohtml_convert_post_process: error closing $output_filename: $!\n";

    # Get rid of all the htm(l) files, including index.html, in the "pages"
    # subdirectory. The pattern is single-quoted on purpose: inside double
    # quotes "\." collapses to "." and the dot would match any character.
    &FileUtils::removeFilesFiltered($pages_subdir, '\.html?$');
}
403
# Debugging aid. Most html <tags> don't get printed out in GLI, so this
# prints the given string (or Mojo::DOM object, which stringifies itself
# when used as a string) to STDERR with every '<' shown as '[' and every
# '>' shown as ']', prefixed with "#### ".
sub _debug_print_html
{
    my ($self, $string_or_dom) = @_;

    my $bracketed = "$string_or_dom";    # force stringification of DOM objects
    $bracketed =~ tr/<>/[]/;

    print STDERR "#### $bracketed\n";
}
423
# Helper that reads in one page of the paged_html output generated by Xpdf's
# pdftohtml, modifies the html using the HTML parsing functions offered by
# Mojo::DOM (see https://mojolicious.org/perldoc/Mojo/DOM) and returns the
# modified HTML content as a string.
#
# The returned markup is a single <div id="page<N>"> that contains, in order:
# the page's <style> element (with its "#f" id selectors rewritten to
# ".p<N>f" class selectors), an optional "Pages a-b" heading, a "Page <N>"
# heading, and all child nodes of the original <body>.
#
# Parameters:
#   $pagefile       - full path of the page's html file
#   $page_num       - number of this page
#   $num_html_pages - total number of html pages in the document
# Returns: the page div serialised as a string.
sub _process_paged_html_page
{
    my $self = shift (@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    # handling content encoding the same way default_convert_post_process does
    my $text = "";
    $self->read_file ($pagefile, "utf8", "", \$text);

    my $dom = Mojo::DOM->new($text);

    # The page's <style> element has to move into the <div> we are about to
    # create. Its id selectors become class selectors first, because the
    # span ids they refer to are turned into classes further down.
    my $style_element;
    my $html_element = $dom->at('html');
    my $orig_style = defined($html_element) ? $html_element->at('style') : undef;
    if (defined $orig_style) {
        my $page_style_tag_str = $orig_style->to_string;
        my $css_class = ".p".$page_num."f";
        $page_style_tag_str =~ s@\#f@$css_class@sg;
        $style_element = Mojo::DOM->new($page_style_tag_str)->at('style');
    }

    # The first image's height sets the height of the div that replaces
    # this page's <body>.
    my $first_img = $dom->find('img')->[0];
    my $img_height = defined($first_img) ? $first_img->{height} : undef;

    # On the background image:
    # 1. additionally set z-index: -1 in its style attribute,
    # 2. point its src attribute at the "pages" subdirectory,
    # 3. set class=background and change its id to "background" + page_num.
    my $bg_img_tag = $dom->find('img#background')->[0];
    if (defined $bg_img_tag) {
        my $img_style_str = $bg_img_tag->{style};
        $img_style_str = "" unless defined $img_style_str;
        $img_style_str =~ s/\s+$//;
        # CSS declarations are "property: value" and are separated by ';',
        # so terminate the existing declarations before adding ours.
        $img_style_str .= ";" if $img_style_str ne "" && $img_style_str !~ /;$/;
        $img_style_str .= " z-index: -1;";

        my $img_src_str = $bg_img_tag->{src};
        $img_src_str = "" unless defined $img_src_str;

        $bg_img_tag->attr({
            style => $img_style_str,
            src   => "pages/$img_src_str",
            class => "background",
            id    => "background".$page_num,
        });
    }

    # For all the <span> elements nested inside <div class="txt">, set the
    # class to "p" + page_num + id-of-the-span and then delete the id: the
    # span ids are reused on every page, whereas element ids ought to be
    # unique within the combined document.
    $dom->find('div.txt span')->each(sub {
        $_->attr(class => "p". $page_num. $_->{id});
        delete $_->{id};
    });

    # Create the new dom, starting with a div tag for the current page:
    # <div id="page$page_num" style="position: relative; height: ${img_height}px;">
    my $div_style = "position: relative;";
    $div_style .= " height: ".$img_height."px;" if defined $img_height;
    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => $div_style);
    my $page_div = $new_dom->at('div');

    $page_div->append_content($style_element) if defined $style_element;

    # Page range bucket heading. For now only PDFs with fewer than 1000
    # pages are considered, with buckets of size 10 (1-10, ..., 51-60, ...).
    # NOTE(review): this adds the heading to every page that has more than
    # 10 pages after it, not only to the first page of each bucket - confirm
    # whether that is intended.
    if(($num_html_pages - $page_num) > 10) {
        # e.g. pages 21 to 29 (and 30) are in the 21 to 30 range
        my $start_range = $page_num - ($page_num % 10) + 1;
        my $end_range = $page_num + 10 - ($page_num % 10);
        if($page_num % 10 == 0) { # page 20, however, is in the 11 to 20 range
            $start_range -= 10;
            $end_range -= 10;
        }
        $page_div->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range));
    }

    # Add a simpler heading: just the page number, "Page #"
    $page_div->append_content($new_dom->new_tag('h2', "Page ".$page_num));

    # Copy across all the old html's body tag's child nodes into the new div
    my $body = $dom->at('body');
    if (defined $body) {
        $body->child_nodes->each(sub { $page_div->append_content($_) });
    }

    return $new_dom->to_string;
}
521
# PDFPlugin's post-processing for every conversion mode except
# "paged_html". This is what PDFPlugin always used to do.
#
# Reads the converted file, counts its page anchors (stored in
# $self->{'num_pages'}), and - when use_sections is set and the file was
# converted to HTML - wraps each page in Greenstone <Section> comment markup.
# The text is then written back to the same file.
#
# Parameters: $conv_filename - path of the converted file (rewritten in place).
# Returns:    the result of utf8_write_file().
sub default_convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # Read in the file. The encoding is utf8 for html files - what about
    # other output types?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: the sectioning code
    # below expects each new page to be prefixed with <a name=pagenum></a>,
    # which it splits on. PDFBox does not generate that, but it does embed
    # each page in a div with a recognisable opening tag, so <a name=0></a>
    # is prefixed to each such div here. The page number is fixed at 0 at
    # this point; the real numbers are assigned with $loopcounter when the
    # sections are processed below.
    my $loopcounter = 0;
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Clean the html of low and high surrogates (D800-DFFF).
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;

    # Calculate the number of pages from the <a name=...> tags (there is one
    # per page). This is done even if the document is not being split into
    # pages, as it gives an indication of document length; the metadata
    # itself is not set until process().
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # We have "<a name=1></a>" etc for each page (it may be "<A name=").
        # The first piece is the HTML header etc; each later piece is one
        # section that starts with the leftover of its anchor.
        my @sections = split('<[Aa] name=', $text);

        # Decide this once, before the header is shifted off. Testing the
        # element count again afterwards would mistake a document with
        # exactly one anchor for an unsplit one and write it back with the
        # "<a name=" of its anchor missing.
        my $has_sections = (scalar(@sections) > 1);
        my $top_section = "";

        if ($has_sections) {
            $top_section .= shift @sections; # keep HTML header etc as top_section

            # The document title is taken from the start of the first section.
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }
        else {
            print $outhandle "PDFPlugin: warning - no sections found\n";
            $top_section .= $sections[0] if @sections; # nothing at all for empty input
            @sections = ();
        }

        # Add metadata per section...
        foreach my $section (@sections) {
            # Strip the leftover from the split expression and take the
            # section title from it. Section names are not always just
            # digits; they may be like "outline". The capture is only read
            # when the substitution succeeded: $1 is not reset by a failed
            # match and would otherwise still hold an earlier capture.
            my $section_title = ($section =~ s@^\"?(\w+)\"?></a>@@) ? $1 : undef;

            # A title of 0 means use_sections is being applied to output
            # from PDFBox, whose page divs were all prefixed with
            # <a name=0></a> above. Number those pages here. (Greenstone
            # does magic if sections are titled with digits.)
            if ($loopcounter > 0
                || (defined($section_title) && $section_title eq "0")) {
                $section_title = ++$loopcounter;
            }

            if (! defined($section_title) ) {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            $section = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><br />\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $text = join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # Turning invalid high bytes into utf-8 (unicode::ensure_utf8) should no
    # longer be needed, now that strings read in are Unicode aware (in the
    # Perl sense) rather than raw binary strings that just happen to be
    # UTF-8 compliant.

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
652
653
# Does the PDF-specific processing of $doc_obj after the generic
# process_type() step:
#   - replaces each extracted "date" metadata value on the top section with
#     a "Date" value in Greenstone's YYYYMMDD format,
#   - adds "NumPages" metadata when the page count is known,
#   - sets "gsdlthistype" when use_sections was applied to HTML output.
#
# Parameters: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli)
# Returns:    the result of process_type().
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # Fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'.
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time. Taking the
        # captures from the match's return list (rather than from $1..$3)
        # leaves them undefined when a value does not look like a date,
        # instead of reusing the captures of an earlier successful match.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless defined($year) && defined($month) && defined($day);

        # Some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        next if $year == 0;

        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
698
6991;
Note: See TracBrowser for help on using the repository browser.