Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago
First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion are moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. Yet to introduce the PDFv2Plugin. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 31.2 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use strict;
28	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29	no strict 'subs'; # allow filehandles to be variables and viceversa
30
31	use ReadTextFile;
32	use unicode;
33	use Mojo::DOM; # for HTML parsing
34
35	use AutoLoadConverters;
36	use ConvertBinaryFile;
37
38	@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
39
40
# Output formats selectable through the plugin's convert_to option.
# Each entry pairs the option value ('name') with the resource key ('desc')
# under which its description is looked up.
my $convert_to_list = [
    {
        'name' => 'auto',
        'desc' => '{ConvertBinaryFile.convert_to.auto}',
    },
    {
        'name' => 'html',
        'desc' => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertBinaryFile.convert_to.text}',
    },
    {
        'name' => 'paged_html',
        'desc' => '{PDFPlugin.convert_to.paged_html}',
    },
    {
        'name' => 'pagedimg_jpg',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}',
    },
    {
        'name' => 'pagedimg_gif',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}',
    },
    {
        'name' => 'pagedimg_png',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}',
    },
];
57
58
# Declarations of the options this plugin accepts. Each hash describes one
# option: its name, the resource key of its description, its type and,
# where given, its default value ('deft') and whether it is required.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{CommonUtil.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title,Author,Subject,Keywords',
    },
    {
        'name' => 'metadata_field_separator',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {
        'name' => 'zoom',
        'desc' => '{PDFPlugin.zoom}',
        'deft' => '2',
        # 'range' => "1,3", # actually the range is 0.5-3
        'type' => 'string',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
    {
        'name' => 'use_realistic_book',
        'desc' => '{PDFPlugin.use_realistic_book}',
        'type' => 'flag',
    },
];
111
# Plugin-level description record; new() pushes it onto the caller's
# "OptList" alongside the argument declarations.
my $options = {
    'name'     => 'PDFPlugin',
    'desc'     => '{PDFPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
118
# Constructor.
#
# Parameters:
#   $class           - class name to bless the plugin into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of option strings; a default -title_sub
#                      is appended before the parent constructors see it
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive
#                      this plugin's argument and option declarations
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set, the object is returned before any conversion settings are derived.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs, "-title_sub");
    push(@$inputargs, '^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-form constructor calls: the indirect-object form ("new Class(...)")
    # is ambiguous inside a sub that is itself called new.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["PDFBoxConverter"], 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    $self = bless $self, $class;
    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return $self;
    }

    $self->{'file_type'} = "PDF";

    # PDFPlugin is deprecated and migrating users should hereafter choose between
    # PDFv1Plugin, if they want to use the old pdftohtml tool's capabilities,
    # and PDFv2Plugin, if they want to use pdfbox or the new xpdftools capabilities.
    &gsprintf::gsprintf(STDERR, "{PDFPlugin.deprecated_plugin}");

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    # By default, PDFPlugin assumes gsConvert.pl will run the old pdftohtml conversion tool,
    # But if pdfbox conversion is turned on, the tool used is pdfbox (which is presently an
    # AutoLoadConverter and therefore bypasses gsConvert.pl)
    $self->{'convert_options'} = "-pdf_tool pdftohtml";
    $self->{'convert_options'} .= " -pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    # GSDLOS may be absent from the environment, so test definedness first.
    my $on_windows = defined($ENV{'GSDLOS'}) && $ENV{'GSDLOS'} =~ /^windows$/i;
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # "auto" currently resolves to paged_html
        $self->{'convert_to'} = "paged_html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options, "-metadata_fields", $self->{'metadata_fields'});
        } else {
            push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
        if ($self->{'convert_to'} eq "paged_html") {
            # For paged html, the default should be to sectionalise on headings
            # the single superpage containing divs representing individual
            # pages as sections. The option needs the same leading "-" as
            # every other option pushed here.
            push(@$specific_options, "-sectionalise_using_h_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
233
# Returns the default process_exp: a case-insensitive pattern matching
# file names that end in ".pdf". Takes no meaningful arguments (it is also
# called as a plain function while the argument list is being built).
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
239
# Returns an empty default block_exp, so that this plugin does not
# inherit HTMLPlug's block expression.
sub get_default_block_exp {
    return q{};
}
244
# Initialises the plugin by delegating, with the same arguments, first to
# the superclass init and then to AutoLoadConverters' init. The value of
# the AutoLoadConverters call is what this sub returns.
sub init {
    my $self = shift;

    # ConvertBinaryFile init
    $self->SUPER::init(@_);

    $self->AutoLoadConverters::init(@_);
}
253
# Delegates, with the same arguments, to AutoLoadConverters' begin and then
# to the superclass begin. The value of the superclass call is what this
# sub returns.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);

    $self->SUPER::begin(@_);
}
261
# Delegates, with the same arguments, to AutoLoadConverters' deinit and
# then to the superclass deinit. The value of the superclass call is what
# this sub returns.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);

    $self->SUPER::deinit(@_);
}
269
# Hashing on the GA xml ensures that two PDF files that are identical
# except for their metadata hash to different values. Without this, when
# each PDF file is converted to HTML there is a chance that the results
# will be identical if the conversion utility does not embed the metadata
# in the generated HTML. This is certainly the case when PDFBOX is used.
#
# This also makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should
# be seen as a different document.
#
# Returns the constant string "hash_on_ga_xml".
sub get_oid_hash_type {
    return 'hash_on_ga_xml';
}
287
288
# Routes conversion through the AutoLoadConverters implementation of
# tmp_area_convert_file, passing the caller's arguments through unchanged
# and returning whatever that implementation returns.
sub tmp_area_convert_file {
    my $self = shift;
    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
295
# Override that adds extra handling for the paged_html output mode.
#
# Parameters (after $self): $tmp_dirname, $tmp_inputPDFname, $utf8_tailname,
# $lc_suffix, $tailname, $suffix. Only the first two are used here; the
# full list is forwarded untouched to ConvertBinaryFile.
#
# For every mode other than paged_html the ConvertBinaryFile result is
# returned as is. For paged_html the return value is instead the path
# <tmp_dirname>/pages/index.html, and the final post-processed file name
# (the input PDF's name with an .html extension, in the same directory) is
# stored in $self->{'conv_filename_after_post_process'}.
sub run_conversion_command {
    my $self = shift(@_);
    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    return $self->ConvertBinaryFile::run_conversion_command(@_)
        unless $self->{'convert_to'} eq "paged_html";

    # In paged_html mode Xpdf's pdftohtml is told to create a "pages" subdir
    # in the tmp area for its products (it needs to be handed a directory
    # that does not exist yet). If it ran successfully, the intermediary
    # file tmp/<random-num>/pages/index.html should exist, among others.
    #
    # ConvertBinaryFile proceeds normally; its own return value is not used,
    # because on success the product to expect is pages/index.html.
    my $parent_result = $self->ConvertBinaryFile::run_conversion_command(@_);

    # Once convert_post_process() is done, the final product of the
    # paged_html conversion should be an html file of the same name and in
    # the same tmp location as the input PDF file.
    my ($pdf_basename, $pdf_dir) = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
    $self->{'conv_filename_after_post_process'}
        = &FileUtils::filenameConcatenate($pdf_dir, $pdf_basename . ".html");
    # print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";

    my $pages_index_html = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");
    return $pages_index_html;
}
329
# Dispatches post-processing of the converted file according to the
# conversion mode.
#
# Parameters (after $self): $conv_filename - the file produced by the
# conversion step.
#
# Returns whatever the selected post-processing method returns.
sub convert_post_process {
    my $self = shift(@_);
    my ($conv_filename) = @_;

    # Everything except paged_html goes through PDFPlugin's usual post processing.
    return $self->default_convert_post_process($conv_filename)
        if $self->{'convert_to'} ne "paged_html";

    # paged_html: the HTML pages generated by xpdf's pdftohtml need to be
    # massaged into the form we want.
    return $self->xpdftohtml_convert_post_process($conv_filename);
}
346
# Maps a file name from the "pages" directory to its page number.
# Files are named index.html, page1.html, page2.html, ..., pagen.html;
# anything that is not pageN.htm(l), including index.html, is given 0.
# (Defined at package level: a named sub is never local to another sub.)
sub page_number {
    my ($filename) = @_;
    my ($pagenum) = ($filename =~ m/^page(\d+)\.html?$/i);
    $pagenum = 0 unless defined $pagenum;
    return $pagenum;
}

# Called after gsConvert.pl has been run to convert a PDF to paged_html
# using Xpdftools' pdftohtml, which produces one HTML doc per PDF page in a
# "pages" subdirectory of the tmp location (it would have failed had the
# output dir already existed).
#
# This method builds a single html file, $self->{'conv_filename_after_post_process'},
# holding a slightly modified version of every page file in page order
# (see _process_paged_html_page), and then removes all htm(l) files,
# including index.html, from the "pages" subdirectory.
#
# Parameters (after $self):
#   $pages_index_html - tmp/<rand>/pages/index.html for paged_html mode;
#                       only its directory part is used.
#
# Dies if the pages directory cannot be read, or if the output file cannot
# be opened or closed.
sub xpdftohtml_convert_post_process {
    my $self = shift(@_);
    my ($pages_index_html) = @_;
    my $output_filename = $self->{'conv_filename_after_post_process'};

    my (undef, $pages_subdir) = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");

    # Read in all the plain files of the pages directory.
    opendir(my $pages_dh, $pages_subdir) or die "can't opendir $pages_subdir: $!";
    my @page_files = grep { -f "$pages_subdir/$_" } readdir($pages_dh);
    closedir($pages_dh);

    # Order by page number rather than lexically, then keep only the
    # per-page html files. For every html file there may be an image file
    # (and potentially other file types), so count the html pages explicitly.
    @page_files = sort { page_number($a) <=> page_number($b) } @page_files;
    my @html_page_files = grep { m/\.html?$/ && $_ !~ /^index\.html?/i } @page_files;
    my $num_html_pages = scalar(@html_page_files);

    # Opening tags for the combined page. These are the same tags and
    # contents, including <meta>, as Xpdf's pdftohtml generates for each of
    # its individual html pages.
    my ($output_tailname) = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$output_tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";
    $start_text .= "<h1>$output_tailname</h1>\n\n";

    # Write each processed page out as we go instead of building one giant
    # string in memory. File open code follows CommonUtil::utf8_write_file().
    my $out_fh;
    if (!open($out_fh, ">:utf8", $output_filename)) {
        # Fully qualified, as elsewhere in this file: gsprintf is not
        # imported into this package by any use line visible here.
        &gsprintf::gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    foreach my $pagefile (@html_page_files) {
        my $page_num = page_number($pagefile);
        my $pagefile_path = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
        # print STDERR "@@@ About to process html file $pagefile_path (num $page_num)\n";
        my $modified_page_contents = $self->_process_paged_html_page($pagefile_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    print $out_fh "</body>\n</html>\n"; # write out closing tags
    # Buffered write errors only surface at close, so check it.
    close($out_fh)
        or die "PDFPlugin::xpdftohtml_convert_post_process: error closing $output_filename: $!\n";

    # Get rid of all the htm(l) files incl index.html in the "pages" subdir:
    # they have all been folded into the single html file one level up and
    # we don't want HTMLPlugin to process them next. Single quotes keep the
    # backslash, so the filter is a literal dot followed by htm or html
    # (in double quotes "\." collapses to ".", which matches any character).
    &FileUtils::removeFilesFiltered($pages_subdir, '\.html?$'); # no specific whitelist, but blacklist htm(l)

    # The tmp area should now contain a single html file with all the html
    # pages' contents in sequence, and a "pages" subdir containing the
    # screenshot images of each page, for HTMLPlugin to process further.
}
451
# Debugging aid: most html <tags> don't get printed out in GLI, so this
# prints its argument to STDERR with every "<" shown as "[" and every ">"
# shown as "]", prefixed by "#### ".
#
# Parameters (after $self): $string_or_dom - a string, or an object that
# stringifies (e.g. a Mojo::DOM node).
sub _debug_print_html {
    my ($self, $string_or_dom) = @_;

    # Work on a stringified copy; the caller's value is left untouched.
    my $bracketed = defined($string_or_dom) ? "$string_or_dom" : "";
    $bracketed =~ tr/<>/[]/;

    print STDERR "#### $bracketed\n";
}
470
# Reads one page file generated by Xpdf's pdftohtml, reshapes its html with
# Mojo::DOM (see https://mojolicious.org/perldoc/Mojo/DOM) and returns the
# result as a string.
#
# Parameters (after $self):
#   $pagefile       - path of the page's html file
#   $page_num       - number of this page
#   $num_html_pages - total number of page html files
#
# Returns a string of the form <div id="pageN">...</div> holding optional
# heading tags followed by an inner div with the page's style element and
# the child nodes of the page's <body>.
sub _process_paged_html_page {
    my $self = shift(@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    # handling content encoding the same way default_convert_post_process does
    my $page_html = "";
    $self->read_file($pagefile, "utf8", "", \$page_html);

    my $dom = Mojo::DOM->new($page_html);

    # Take the first <style> element under <html> and rewrite its "#f"
    # id selectors into page-specific class selectors (".p<page_num>f").
    # It is re-parsed so it can be inserted into the new div below.
    my $css_class = ".p" . $page_num . "f";
    my $page_style_tag_str = $dom->at('html')->at('style')->to_string;
    $page_style_tag_str =~ s@\#f@$css_class@sg;
    my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style');

    # The height of the first image becomes the height of the div that
    # replaces this page's <body>.
    my $img_height = $dom->find('img')->[0]{height};

    # Point img#background at the "pages" subdir, give it class
    # "background" and change its id to "background<page_num>".
    my $bg_img_tag = $dom->find('img#background')->[0];
    $bg_img_tag->attr(src => "pages/" . $bg_img_tag->{src});
    $bg_img_tag->attr({class => "background", id => "background" . $page_num});

    # The span ids are reused across pages although element ids ought to be
    # unique, so move each span's id into a page-specific class
    # ("p<page_num><id>") and drop the id itself.
    foreach my $span ($dom->find('div.txt span')->each) {
        $span->attr(class => "p" . $page_num . $span->{id});
        delete $span->{id};
    }

    # New dom for this page:
    # <div style="position: relative; height: <img_height>px;">
    my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: " . $img_height . "px;");
    $new_dom->at('div')->append_content($style_element);

    # Copy across all the old html's body tag's child nodes into the new div.
    $dom->at('body')->child_nodes->each(sub {
        $new_dom->at('div')->append_content($_);
    });
    my $inner_div_str = $new_dom->to_string;

    # Build up the outer div with the <h> tags used for sectionalising.
    my @page_div_parts = ("<div id=\"page" . $page_num . "\">\n");

    # Page range bucket heading (buckets of 10 pages, e.g. "Pages 21-30"),
    # emitted on the first page of a bucket.
    # NOTE(review): the original comments ask for a heading when the number
    # of remaining pages is >= 10, but the condition requires more than 10
    # pages after this one - confirm which is intended.
    my $offset_in_bucket = $page_num % 10;
    if ($offset_in_bucket == 1 && ($num_html_pages - $page_num) > 10) {
        my $start_range = $page_num - $offset_in_bucket + 1;
        my $end_range = $page_num + 10 - $offset_in_bucket;
        push(@page_div_parts,
             "<h2 style=\"font-size:1em;font-weight:normal;\">Pages " . $start_range . "-" . $end_range . "</h2>\n");
    }

    # No sectionalising for 10 pages or under. Otherwise every page is a
    # section too, with a simple "Page #" heading.
    if ($num_html_pages > 10) {
        push(@page_div_parts,
             "<h3 style=\"font-size:1em;font-weight:normal;\">Page " . $page_num . "</h3>\n");
    }

    push(@page_div_parts, $inner_div_str, "\n</div>");

    return join('', @page_div_parts);
}
571
# PDFPlugin post-processing for every conversion mode except "paged_html".
# This is what PDFPlugin always used to do.
#
# Parameters (after $self):
#   $conv_filename - the converted file; it is read, modified and written
#                    back in place.
#
# Effects:
#   - prefixes <a name=0></a> to each PDFBox page div;
#   - replaces surrogate code points with spaces;
#   - stores the number of <a name=...> anchors in $self->{'num_pages'};
#   - with use_sections and converted_to "HTML", wraps each page in
#     Greenstone <Section> comments.
# Returns whatever $self->utf8_write_file returns.
sub default_convert_post_process {
    my $self = shift(@_);
    my ($conv_filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types?
    my $text = "";
    $self->read_file($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs
    # into pages for sections. The code below wants each new page to be
    # prefixed with <a name=pagenum></a>, which it then splits on to generate
    # page-based sections. PDFBox does not generate that, but it embeds each
    # page in an extra div, so <a name=0></a> is prefixed to each such div.
    # The page number is fixed at 0 here; the section loop further down
    # replaces it with an incrementing counter.
    my $loopcounter = 0; # used later on!
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page). Metadata based on this calculation is not set until
    # process(). This is done even if we are not breaking the document into
    # pages, as num_pages is a useful indication of document length.
    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar(@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            my $html_header = shift @sections; # keep HTML header etc as top_section
            $top_section .= $html_header if defined $html_header;
        }

        # The title of the top section is derived from the text of the
        # first remaining section (the first page).
        my $title = defined($sections[0]) ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        # nbsp as an entity, as raw utf-8 bytes, or as the decoded character
        # (read_file hands back decoded text, where nbsp is U+00A0)
        $title =~ s/(?:&nbsp;|\xc2\xa0|\xa0)/ /g;
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if (scalar(@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections = ();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Strip the leftover from the split expression. Only trust $1 when
            # the substitution matched: after a failed match $1 still holds
            # the capture of an earlier successful match (such as the
            # title_sub one above), which would become a bogus title.
            my $anchor_found = ($section =~ s@^\"?(\w+)\"?></a>@@);
            $title = $anchor_found ? $1 : undef; # Greenstone does magic if sections are titled digits

            # A title of pagenum=0 means use_sections is being applied on
            # output from PDFBox, whose pages were all prefixed with
            # <a name=0></a> above. Number those pages incrementally here.
            if ($loopcounter > 0 || (defined($title) && $title eq "0")) { # implies use_sections with PDFBox
                $title = ++$loopcounter;
            }

            if (!defined($title)) {
                print STDERR "no title: $section\n";
                $title = " "; # get rid of the undefined warning in next line
            }
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $title
                . "</Metadata>\n--><br />\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection;
        }

        $text = join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file(\$text, $conv_filename);
}
702
703
# Plugin specific processing of doc_obj for HTML type.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $metadata,
# $doc_obj, $gli. Only $base_dir, $file and $doc_obj are used here.
#
# Runs process_type(), then on the top section of $doc_obj:
#   - replaces each extracted "date" value with a "Date" value in
#     yyyymmdd form (values without a d-d-d pattern, or with year 0, are
#     dropped);
#   - adds "NumPages" when $self->{'num_pages'} is defined;
#   - sets "gsdlthistype" when use_sections is on and the conversion
#     produced HTML.
# Returns the result of process_type().
sub process {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Skip values that do not match: after a failed match $1..$3 would
        # still hold the captures of some earlier successful match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        $year += 1900 if $year < 100;             # just to be safe
        $month = "0$month" if $month =~ /^\d$/;   # single digit
        $day = "0$day" if $day =~ /^\d$/;         # single digit
        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
748
749	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: