Context Navigation

PDFv2Plugin.pm@ 32283

Last change on this file since 32283 was 32283, checked in by ak19, 6 years ago

More stable behaviour by PDFv2Plugin: 1. When pdfbox_conversion is on but an output option supported by xpdftools is selected, it now uses xpdftools anyway instead of attempting to use pdfbox_conversion. 2. When pdfbox_conversion is not on and an output format that it alone supports (i.e. one not supported by xpdftools) is selected, a warning message is displayed saying that xpdftools will be used to output to a fallback output format, and that the user should switch on pdfbox_conversion if the originally selected format is wanted. This message was present and displayed in a recent commit, but the behaviour was not set up yet then. In future, we may solve this differently if it's decided that PDFBoxConverter is not an AutoLoadConverter and will therefore always be available with PDFv2Plugin (but what about GS2, where PDFBox is an optional extension?)

File size: 32.9 KB

Line
1	###########################################################################
2	#
3	# PDFv2Plugin.pm -- pdf plugin that uses xpdftools or, if switched on,
4	# pdfbox, to process PDFs.
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2001 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	package PDFv2Plugin;
27
28	use strict;
29	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
30	no strict 'subs'; # allow filehandles to be variables and viceversa
31
32	use ReadTextFile;
33	use unicode;
34	use Mojo::DOM; # for HTML parsing
35
36	use AutoLoadConverters;
37	use ConvertBinaryFile;
38
39	@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
40
41
# Output formats selectable through the plugin's convert_to option.
# Each row is [ name, description-resource-key ]; the trailing comment records
# which tool the original annotations associate with the format.
my $convert_to_list = [
    map { +{ 'name' => $_->[0], 'desc' => $_->[1] } } (
        [ 'auto',              '{ConvertBinaryFile.convert_to.auto}' ],
        [ 'text',              '{ConvertBinaryFile.convert_to.text}' ],            # xpdftools
        [ 'paged_text',        '{ConvertBinaryFile.convert_to.paged_text}' ],      # xpdftools

        [ 'html',              '{PDFPlugin.convert_to.html}' ],                    # pdfbox ## TODO: rename this to html_without_imgs
        [ 'pretty_html',       '{PDFPlugin.convert_to.pretty_html}' ],             # xpdftools
        [ 'paged_pretty_html', '{PDFPlugin.convert_to.paged_pretty_html}' ],       # xpdftools

        # pdfbox
        [ 'pagedimg_jpg',      '{ConvertBinaryFile.convert_to.pagedimg_jpg}' ],
        [ 'pagedimg_png',      '{ConvertBinaryFile.convert_to.pagedimg_png}' ],

        [ 'pagedimgtxt_jpg',   '{ConvertBinaryFile.convert_to.pagedimgtxt_jpg}' ],
        [ 'pagedimgtxt_png',   '{ConvertBinaryFile.convert_to.pagedimgtxt_png}' ],
    )
];
68
69
# Plugin option declarations (name, description resource key, type, default).
# This list is appended to the shared "ArgList" in new().
my $arguments = [
    {   name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'pretty_html',
    },
    {   name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{CommonUtil.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {   name => 'metadata_fields',
        desc => '{HTMLPlugin.metadata_fields}',
        type => 'string',
        deft => 'Title,Author,Subject,Keywords',
    },
    {   name => 'metadata_field_separator',
        desc => '{HTMLPlugin.metadata_field_separator}',
        type => 'string',
        deft => '',
    },

    # Options currently disabled (kept for reference):
    #   noimages          {PDFPlugin.noimages}          flag
    #   allowimagesonly   {PDFPlugin.allowimagesonly}   flag
    #   complex           {PDFPlugin.complex}           flag
    #   nohidden          {PDFPlugin.nohidden}          flag

    {   name => 'zoom',
        desc => '{PDFv2Plugin.zoom}',
        deft => '1',
        type => 'string',    # a string because xpdftools' zoom takes fractions
    },

    # Options currently disabled (kept for reference):
    #   use_sections      {PDFPlugin.use_sections}      flag
    #   description_tags  {HTMLPlugin.description_tags} flag

    {   name => 'use_realistic_book',
        desc => '{PDFPlugin.use_realistic_book}',
        type => 'flag',
    },
];
121
# Plugin descriptor; pushed onto the shared "OptList" in new().
my $options = {
    name           => 'PDFv2Plugin',
    desc           => '{PDFPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    srcreplaceable => 'yes',    # Source docs in PDF can be replaced with GS-generated html
    args           => $arguments,
};
128
# Constructor.
#
# Parameters: ($class, $pluginlist, $inputargs, $hashArgOptLists), the usual
# plugin constructor arguments. $pluginlist, $inputargs and the "ArgList" /
# "OptList" entries of $hashArgOptLists are appended to in place.
#
# Builds the object by merging an AutoLoadConverters instance (asked to load
# "PDFBoxConverter") with a ConvertBinaryFile instance, then:
#   * sets the gsConvert.pl options (tool = xpdftools, zoom),
#   * normalises 'convert_to' (auto -> pretty_html, realistic book -> html,
#     and a fallback format when pdfbox_conversion is off),
#   * assembles the option list for the secondary plugin and loads it.
#
# Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs,"-title_sub");
    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method-call syntax: the indirect form "new Class(...)" is fragile
    # inside a package that itself defines a sub called new.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # convert_options are passed through to gsConvert.pl by ConvertBinaryFile.pm

    # the most important option is the tool that's used to do the conversion
    # default for PDFv2Plugin. If pdfbox_conversion is on, the pdfbox GS extension sets pdf_tool to pdfbox
    $self->{'convert_options'} = "-pdf_tool xpdftools";

    # pdf_zoom is supported by xpdftools' pdftohtml. So for pretty_html and paged_pretty_html
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} .= " -pdf_zoom $zoom";

    # PDFv2Plugin now supports PDF to txt conversion on Windows too:
    # using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.win_pdftotext_info}\n");
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # pretty_html is the best default option when using xpdftools
        $self->{'convert_to'} = "pretty_html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }

    # if pdfbox_conversion is not on, check convert_to to make sure that xpdftools can
    # support the selected output format, or fallback on a sensible default
    # Not all available conversion output options are possible with xpdftools, as some are
    # only handled by pdfbox. If a format is unavailable with xpdftools, fall back as below.
    if (!$self->{"pdfbox_conversion"}) {
        my $convert_to = $self->{'convert_to'};
        my $fallback_convert_to = $convert_to;
        if ($convert_to =~ /^html$/) {
            $fallback_convert_to = "pretty_html";
        }
        elsif ($convert_to =~ /^pagedimg/) {
            $fallback_convert_to = "paged_pretty_html";
        }
        elsif ($convert_to =~ /^paged_text$/) {
            $fallback_convert_to = "text";
        }

        if ($convert_to =~ /^(html|pagedimg|paged_text)/) {
            &gsprintf::gsprintf(STDERR, "{PDFv2Plugin.conversion_needs_pdfbox}\n", ($convert_to, $fallback_convert_to));
            $self->{'convert_to'} = $fallback_convert_to;
        }
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
        if ($self->{'convert_to'} eq "paged_pretty_html") {
            # for paged pretty html, the default should be to sectionalise the single
            # superpage (the one containing divs representing individual pages) on headings.
            # The option needs its leading dash like every other option pushed here;
            # the original pushed the bare word "sectionalise_using_h_tags".
            push(@$specific_options, "-sectionalise_using_h_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self is already blessed into $class above; no need to bless again.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
259
# Default process_exp: a case-insensitive match on a trailing ".pdf".
# Takes no meaningful arguments (it is also called as a plain function while
# the argument table is being built).
sub get_default_process_exp {
    my $pdf_suffix_re = '(?i)\.pdf$';
    return $pdf_suffix_re;
}
265
# Default block_exp is deliberately empty so that this plugin does not
# inherit HTMLPlug's block expression.
sub get_default_block_exp {
    my $no_block_exp = '';
    return $no_block_exp;
}
270
# Initialise both halves of the plugin: first the superclass chain
# (ConvertBinaryFile comes first in @ISA), then AutoLoadConverters.
# All arguments after $self are forwarded unchanged to both.
# Returns whatever AutoLoadConverters::init returns.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->AutoLoadConverters::init(@init_args);
}
279
# Start-of-import hook: AutoLoadConverters is given the call first, then the
# superclass chain. All arguments after $self are forwarded unchanged to both.
# Returns whatever the superclass begin() returns.
sub begin {
    my ($self, @begin_args) = @_;

    $self->AutoLoadConverters::begin(@begin_args);
    return $self->SUPER::begin(@begin_args);
}
287
# Tear-down hook: AutoLoadConverters is given the call first, then the
# superclass chain. All arguments after $self are forwarded unchanged to both.
# Returns whatever the superclass deinit() returns.
sub deinit {
    my ($self, @deinit_args) = @_;

    $self->AutoLoadConverters::deinit(@deinit_args);
    return $self->SUPER::deinit(@deinit_args);
}
295
# Hashing is done on the GA XML so that two PDF files that are identical
# except for their metadata hash to different values. Without this, when
# each PDF file is converted to HTML there is a chance that the results
# will be identical, if the conversion utility does not embed the metadata
# in the generated HTML. This is certainly the case when PDFBox is used.
#
# This makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should
# be seen as a different document.
sub get_oid_hash_type {
    my ($self) = @_;

    my $hash_type = "hash_on_ga_xml";
    return $hash_type;
}
313
314
# Chooses which parent implementation performs the conversion in the tmp area.
#
# pretty_html and paged_pretty_html are only produced by xpdftools, so for
# those formats ConvertBinaryFile's implementation is used regardless of
# whether pdfbox_conversion is switched on. All other formats are handed to
# AutoLoadConverters' implementation.
#
# All arguments after $self are forwarded unchanged; the chosen parent's
# return value is returned.
sub tmp_area_convert_file {
    my $self = shift (@_);

    if ($self->{'convert_to'} =~ m/pretty_html$/) { # paged_pretty_html or pretty_html
        # Only mention the override when pdfbox_conversion really is on; the
        # original printed this for every pretty_html conversion, which was
        # misleading when pdfbox_conversion was off.
        if ($self->{'pdfbox_conversion'}) {
            print STDERR '@@@@ PDFBox_conversion is switched on, but pretty_html variants are generated by xpdftools.' . "\n";
        }
        return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # else, output format uses pdfbox:
    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
328
# Override that adds extra bookkeeping for the pretty_html/paged_pretty_html
# output modes.
#
# Parameters (after $self): $tmp_dirname, $tmp_inputPDFname, $utf8_tailname,
# $lc_suffix, $tailname, $suffix. All are forwarded unchanged to
# ConvertBinaryFile::run_conversion_command.
#
# For every other output mode the parent's result is returned directly.
# For (paged_)pretty_html the parent is still run, but the value returned is
# always <tmp_dirname>/pages/index.html, the intermediate product expected
# from Xpdf's pdftohtml (which is told to write into a new "pages" subdir of
# the tmp area). The name of the final file that convert_post_process() is
# to produce (same directory and base name as the input PDF, with an .html
# extension) is stored in $self->{'conv_filename_after_post_process'}.
sub run_conversion_command {
    my $self = shift (@_);
    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    my $wants_pretty_html = ($self->{'convert_to'} =~ m/pretty_html$/);
    if (!$wants_pretty_html) {
        return $self->ConvertBinaryFile::run_conversion_command(@_);
    }

    # (paged_)pretty_html: let ConvertBinaryFile run the conversion as usual.
    # Its return value is deliberately not used; see the header comment.
    my $parent_result = $self->ConvertBinaryFile::run_conversion_command(@_);

    # Once convert_post_process() has run, the final product should be an html
    # file with the same name and tmp location as the input PDF file.
    my ($pdf_basename, $pdf_dir, $pdf_ext)
        = File::Basename::fileparse($tmp_inputPDFname, '\.[^\.]+$');
    $self->{'conv_filename_after_post_process'}
        = FileUtils::filenameConcatenate($pdf_dir, $pdf_basename . ".html");

    # What the caller should expect right after the conversion step itself:
    return FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");
}
364
# Dispatches post-processing of a converted file according to the output mode.
#
# $conv_filename is the file produced by the conversion step. For any
# convert_to value containing "pretty_html" the HTML pages generated by
# xpdf's pdftohtml need massaging, so xpdftohtml_convert_post_process() is
# used; every other mode gets PDFPlugin's usual post processing via
# default_convert_post_process(). The chosen routine's result is returned.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my $is_pretty_html = ($self->{'convert_to'} =~ /pretty_html/);

    return $self->xpdftohtml_convert_post_process($conv_filename) if $is_pretty_html;
    return $self->default_convert_post_process($conv_filename);
}
381
# Called after gsConvert.pl has been run to convert a PDF to (paged_)pretty_html
# using Xpdftools' pdftohtml, which produces one HTML file per PDF page plus an
# index.html inside a "pages" subdirectory of the tmp area.
#
# Parameter: $pages_index_html, the path tmp/<rand>/pages/index.html.
# Reads $self->{'conv_filename_after_post_process'} (set by
# run_conversion_command) as the name of the file to create.
#
# This method builds a single HTML file in which each original page's <body>
# content is nested inside its own <div> (see _process_pretty_html_page), with
# simple section headings for paged_pretty_html. Afterwards all htm(l) files in
# the "pages" subdir, including index.html, are removed so that HTMLPlugin does
# not go on to process them individually; other files there are left alone.
#
# Dies if the pages directory cannot be read or the output file cannot be
# opened or closed.
sub xpdftohtml_convert_post_process
{
    my $self = shift (@_);
    my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for (paged_)pretty_html output mode
    my $output_filename = $self->{'conv_filename_after_post_process'};

    # Only the directory part of the index.html path is needed.
    my (undef, $pages_subdir)
        = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");

    # Read in the names of all the plain files in the pages subdir.
    opendir(my $pages_dh, $pages_subdir) or die "can't opendir $pages_subdir: $!";
    my @page_files = grep {-f "$pages_subdir/$_"} readdir($pages_dh);
    closedir($pages_dh);

    # Keep only the per-page html files (index.html is skipped) and put them in
    # page-number order rather than lexical order. Files are named index.html,
    # page1.html, page2.html, ..., pagen.html
    my @html_pages = sort { page_number($a) <=> page_number($b) }
                     grep { $_ =~ m/\.html?$/ && $_ !~ /^index\.html?/i } @page_files;
    my $num_html_pages = scalar(@html_pages);

    # Prepare to create our new html page that will contain all the individual
    # htmls generated by xpdf's pdftohtml in sequence. The opening tags written
    # here are meant to mirror what pdftohtml emits for each individual page.
    my ($output_tailname, $tmp_subdir, $html_suffix)
        = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$output_tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";

    if ($self->{'convert_to'} =~ /paged_pretty_html/) { # then add the <h>tags for sectionalising
        $start_text .= "<h1>$output_tailname</h1>\n\n";
    }

    # Don't build a giant string in memory of all the pages concatenated and
    # then write it out in one go. Instead, write each modified page out to the
    # final file as it is processed.
    my $out_fh;
    if (!open ($out_fh, ">:utf8", $output_filename)) {
        # Fully qualified: gsprintf is not imported into this package, so the
        # unqualified call used originally would have failed with
        # "Undefined subroutine" instead of printing this message.
        &gsprintf::gsprintf(STDERR, "PDFv2Plugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    # Get the contents of each individual HTML page generated by Xpdf, after first
    # modifying each, and write each out into our single all-encompassing html
    foreach my $page_name (@html_pages) {
        my $page_num = page_number($page_name);
        my $page_path = &FileUtils::filenameConcatenate($pages_subdir, $page_name);
        my $modified_page_contents = $self->_process_pretty_html_page($page_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    # we've now created a single HTML file by concatenating (a modified version)
    # of each paged html file
    print $out_fh "</body>\n</html>\n"; # write out closing tags
    # Buffered write errors only surface at close, so check it.
    close($out_fh)
        or die "PDFv2Plugin::xpdftohtml_convert_post_process: error closing $output_filename: $!\n";

    # Get rid of all the htm(l) files incl index.html in the associated "pages"
    # subdir, since we've now processed them all into a single html file
    # one folder level up and we don't want HTMLPlugin to process all of them next.
    # Single quotes keep the backslash: the original double-quoted "\.html?\$"
    # collapsed to the regex .html?$ where the dot matches any character.
    &FileUtils::removeFilesFiltered($pages_subdir, '\.html?$');

    # now the tmp area should contain a single html file containing all the html
    # pages' contents in sequence, and a "pages" subdir containing the screenshot
    # images of each page.
    # HTMLPlugin will process these further in the plugin pipeline
}

# Sort key for the files in the pages subdir: returns N for pageN.htm(l)
# (case-insensitive) and 0 for anything else, e.g. index.html.
sub page_number {
    my ($filename) = @_;
    my ($pagenum) = ($filename =~ m/^page(\d+)\.html?$/i);
    $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
    return $pagenum;
}
489
# Debugging aid: most html <tags> apparently don't get displayed in GLI, so
# this prints the given value to STDERR with every '<' shown as '[' and every
# '>' shown as ']', prefixed by "#### ".
#
# $string_or_dom may be a plain string or an object that stringifies (such as
# a Mojo::DOM object); it is used in string context, so no explicit
# to_string call is needed by the caller.
sub _debug_print_html
{
    my ($self, $string_or_dom) = @_;

    # Work on a stringified copy so the caller's value is left untouched.
    my $bracketed = "$string_or_dom";
    $bracketed =~ tr/<>/[]/;

    print STDERR "#### " . $bracketed . "\n";
}
508
# Helper for the (paged_)pretty_html modes.
#
# Parameters: $pagefile (path of one html page generated by Xpdf's pdftohtml),
# $page_num (that page's number) and $num_html_pages (total number of such
# pages). Returns the page, reworked with Mojo::DOM, as a string of the form
#   <div id="pageN"> [optional headings] <div style="position: relative; ...">
#       <style>...</style> ...the page body's child nodes... </div> </div>
#
# For paged_pretty_html, <h2>/<h3> headings are added so the final html can be
# sectionalised. See https://mojolicious.org/perldoc/Mojo/DOM
sub _process_pretty_html_page
{
    my ($self, $pagefile, $page_num, $num_html_pages) = @_;

    # Read the page the same way default_convert_post_process reads its input.
    my $page_html = "";
    $self->read_file ($pagefile, "utf8", "", \$page_html);

    my $page_dom = Mojo::DOM->new($page_html);

    # The page carries a <style> element that has to move into the new <div>.
    # Its "#f..." id selectors are first turned into ".p<page_num>f..." class
    # selectors, matching the class names given to the spans further down.
    my $style_markup = $page_dom->at('html')->at('style')->to_string;
    my $class_prefix = ".p" . $page_num . "f";
    $style_markup =~ s@\#f@$class_prefix@sg;
    my $style_node = Mojo::DOM->new($style_markup)->at('style');

    # The height of the first <img> determines the height of the div that is
    # to replace this page's <body>.
    my $img_height = $page_dom->find('img')->[0]{height};

    # Point the background image at the "pages" subdir, and give it
    # class="background" plus an id made unique with the page number.
    my $background_img = $page_dom->find('img#background')->[0];
    $background_img->attr(src => "pages/" . $background_img->{src});
    $background_img->attr({class => "background", id => "background" . $page_num});

    # Span ids are reused from page to page although element ids ought to be
    # unique, so each span nested in a <div class="txt"> gets
    # class "p<page_num><old id>" and loses its id.
    $page_dom->find('div.txt span')->each(sub {
        my ($span) = @_;
        $span->attr(class => "p" . $page_num . $span->{id});
        delete $span->{id};
    });

    # New dom for this page: a positioned div holding the style element
    # followed by all child nodes of the old <body>.
    my $wrapper_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: " . $img_height . "px;");
    $wrapper_dom->at('div')->append_content($style_node);
    $page_dom->at('body')->child_nodes->each(sub {
        my ($body_child) = @_;
        $wrapper_dom->at('div')->append_content($body_child);
    });

    # Assemble the outer div as a list of pieces, joined at the end.
    my @pieces = (qq{<div id="page${page_num}">\n});

    if ($self->{'convert_to'} =~ /paged_pretty_html/) { # then add the <h>tags for sectionalising
        my $pos_in_bucket = $page_num % 10;

        # Page-range bucket heading (buckets of 10 pages, e.g. 1-10, 11-20, ...),
        # emitted on the first page of a bucket when more than 10 pages follow
        # the current one.
        # NOTE(review): the original comment says a heading is wanted when the
        # number of remaining pages is >= 10, but the test below needs more
        # than 10 pages AFTER this one, so e.g. pages 11-20 of a 20-page PDF
        # get no bucket heading of their own. Confirm which is intended.
        if ($pos_in_bucket == 1 && ($num_html_pages - $page_num) > 10) {
            my $start_range = $page_num - $pos_in_bucket + 1;
            my $end_range   = $page_num + 10 - $pos_in_bucket;
            push @pieces, qq{<h2 style="font-size:1em;font-weight:normal;">Pages ${start_range}-${end_range}</h2>\n};
        }

        # No sectionalising for 10 pages or under. Otherwise every page is a
        # section too, with the simple heading "Page #".
        if ($num_html_pages > 10) {
            push @pieces, qq{<h3 style="font-size:1em;font-weight:normal;">Page ${page_num}</h3>\n};
        }
    }

    push @pieces, $wrapper_dom->to_string, "\n</div>";

    # Finished massaging this single html page into the right form.
    return join('', @pieces);
}
613
# PDFv2Plugin post-processing for all conversion modes except "pretty_html"
# and "paged_pretty_html". This is what PDFPlugin always used to do.
#
# Parameter: $conv_filename, the converted file. It is read (as utf8),
# modified in memory and written back to the same name.
#
# Effects:
#   * each PDFBox page div is prefixed with <a name=0></a>,
#   * characters in the surrogate range D800-DFFF are replaced by a space,
#   * $self->{'num_pages'} is set to the number of <a name=...> tags found,
#   * when use_sections is on and the output is HTML, the text is split on
#     those tags and wrapped in <!--<Section>--> markers with Title metadata.
sub default_convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;
    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # TODO: The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs into pages for
    # sections. The PDFPlugin code wants each new page to be prefixed with <a name=pagenum></a>,
    # which it then splits on to generate page-based sections. However, that's not what PDFBox
    # generates in its HTML output. Fortunately, PDFBox does have its own page-separator: it
    # embeds each page in an extra div. The div opener is:
    # <div style=\"page-break-before:always; page-break-after:always\">
    # We look for this and prefix <a name=0></a> to each such div. The page number is fixed
    # at 0 here; later, when each section is processed, $loopcounter supplies the real,
    # incrementing page number for PDFBox output.
    my $loopcounter = 0; # used later on!
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as it might
    # be useful to give an indication of document length in browser through setting
    # num_pages as metadata.
    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig); #<div style=\"?page-break-before:always; page-break-after:always\"?>
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFv2Plugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFv2Plugin: warning - no sections found\n";
        } elsif (@sections) {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # handle first section specially for title? Or all use first 100...

        # An empty input file yields no sections at all; use "" rather than undef.
        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;


        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Strip the leftover from the split expression and keep the section
            # name. $1 is only read when the substitution succeeded; reading it
            # unconditionally (as the original did) yields a stale capture from
            # some earlier match and defeats the "no title" check below.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                $title = $1; # Greenstone does magic if sections are titled digits
            } else {
                $title = undef;
            }

            # A title of pagenum=0 means use_sections is being applied on output from PDFBox,
            # which didn't originally have a <a name=incremented pagenumber></a> to split each page.
            # Our Perl code then prefixed <a name=0></a> to it. Now need to increment the pagenum here:
            if ($loopcounter > 0 || (defined($title) && $title eq "0" && $loopcounter == 0)) { # implies use_sections with PDFBox
                $title = ++$loopcounter;
            }

            if (! defined($title) ) {
                print STDERR "no title: $section\n";
                $title = " "; # get rid of the undefined warning in next line
            }
            my $newsection = "<!-- from PDFv2Plugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $title
                . "</Metadata>\n--><br />\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the array element
        }

        $text=join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }


    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
745
746
# Plugin-specific processing of the doc_obj after the secondary plugin has
# handled the converted file.
#
# Parameters: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
# Returns the result of $self->process_type($base_dir, $file, $doc_obj).
#
# Besides delegating to process_type, this:
#   * replaces each top-section "date" metadata value with a "Date" value in
#     yyyymmdd form (values without a y-m-d pattern, or with year 0, are
#     simply dropped),
#   * adds "NumPages" when $self->{'num_pages'} is defined,
#   * sets "gsdlthistype" when use_sections is on and the output was HTML.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Iterate over a copy: values are deleted from the doc_obj inside the loop,
    # and get_metadata may be handing back the doc_obj's own list.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Capture through the list-context match so that a value which does not
        # match yields undefs rather than stale $1..$3 from an earlier match.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless (defined($year) && defined($month) && defined($day));

        next if ($year == 0);
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
791
792	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm@ 32283

Download in other formats: