Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 32215

Last change on this file since 32215 was 32215, checked in by ak19, 6 years ago
Before reorganising our PDFPlugin in whatever way we ultimately decide, committing a version where, on paged_html output mode, the pages produced by Xpdf's pdftohtml are sectionalised by default if total num pages is more than 10. Also changing inserted HTML heading tags to get the page title to still appear correctly.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.5 KB

Rev	Line
[1410]	1	###########################################################################
	2	#
[15872]	3	# PDFPlugin.pm -- reasonably with-it pdf plugin
[1410]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[2661]	8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
[1410]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[15872]	25	package PDFPlugin;
[1410]	26
[10353]	27	use strict;
[22702]	28	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
[32205]	29	no strict 'subs'; # allow filehandles to be variables and viceversa
[1410]	30
[22705]	31	use ReadTextFile;
	32	use unicode;
[32205]	33	use Mojo::DOM; # for HTML parsing
[22702]	34
[22861]	35	use AutoLoadConverters;
[22864]	36	use ConvertBinaryFile;
[1410]	37
[22861]	38	@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
[22705]	39
	40
# Values accepted by the -convert_to option. Each entry pairs the option
# value ('name') with the resource-bundle key of its description ('desc').
my $convert_to_list = [
    { name => 'auto',         desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'html',         desc => '{ConvertBinaryFile.convert_to.html}' },
    { name => 'text',         desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'paged_html',   desc => '{PDFPlugin.convert_to.paged_html}' },
    { name => 'pagedimg_jpg', desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_gif', desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { name => 'pagedimg_png', desc => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];
	57
	58
	59	my $arguments =
[10889]	60	[
	61	{ 'name' => "convert_to",
[15872]	62	'desc' => "{ConvertBinaryFile.convert_to}",
[10889]	63	'type' => "enum",
	64	'reqd' => "yes",
	65	'list' => $convert_to_list,
	66	'deft' => "html" },
	67	{ 'name' => "process_exp",
[31492]	68	'desc' => "{BaseImporter.process_exp}",
[10889]	69	'type' => "regexp",
	70	'deft' => &get_default_process_exp(),
	71	'reqd' => "no" },
	72	{ 'name' => "block_exp",
[31494]	73	'desc' => "{CommonUtil.block_exp}",
[10889]	74	'type' => "regexp",
	75	'deft' => &get_default_block_exp() },
	76	{ 'name' => "metadata_fields",
[15872]	77	'desc' => "{HTMLPlugin.metadata_fields}",
[10889]	78	'type' => "string",
[24431]	79	'deft' => "Title,Author,Subject,Keywords" },
[21800]	80	{ 'name' => "metadata_field_separator",
	81	'desc' => "{HTMLPlugin.metadata_field_separator}",
	82	'type' => "string",
	83	'deft' => "" },
[10889]	84	{ 'name' => "noimages",
[15872]	85	'desc' => "{PDFPlugin.noimages}",
[10889]	86	'type' => "flag" },
	87	{ 'name' => "allowimagesonly",
[15872]	88	'desc' => "{PDFPlugin.allowimagesonly}",
[10889]	89	'type' => "flag" },
	90	{ 'name' => "complex",
[15872]	91	'desc' => "{PDFPlugin.complex}",
[10889]	92	'type' => "flag" },
	93	{ 'name' => "nohidden",
[15872]	94	'desc' => "{PDFPlugin.nohidden}",
[10889]	95	'type' => "flag" },
	96	{ 'name' => "zoom",
[15872]	97	'desc' => "{PDFPlugin.zoom}",
[10889]	98	'deft' => "2",
	99	'range' => "1,3", # actually the range is 0.5-3
	100	'type' => "int" },
	101	{ 'name' => "use_sections",
[15872]	102	'desc' => "{PDFPlugin.use_sections}",
[10889]	103	'type' => "flag" },
	104	{ 'name' => "description_tags",
[15872]	105	'desc' => "{HTMLPlugin.description_tags}",
[29101]	106	'type' => "flag" },
	107	{ 'name' => "use_realistic_book",
[29102]	108	'desc' => "{PDFPlugin.use_realistic_book}",
[29101]	109	'type' => "flag"}
[10889]	110	];
[3540]	111
# Plugin description record that new() appends to the shared option list.
my $options = {
    name           => 'PDFPlugin',
    desc           => '{PDFPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    srcreplaceable => 'yes',    # Source docs in PDF can be replaced with GS-generated html
    args           => $arguments,
};
	118
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref of plugin names; $class is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      is appended to it
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" entries receive
#                      this plugin's argument and option descriptions
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set, it is returned straight away without configuring any conversion
# settings or loading secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs,"-title_sub");
    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method-call syntax instead of the ambiguous indirect-object form.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    # GSDLOS may be unset in some environments; treat that as "not Windows"
    # rather than matching against undef.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
        if ($self->{'convert_to'} eq "paged_html") {
            # For paged html, the default is to sectionalise on headings the
            # single superpage whose divs represent the individual pages.
            # The option is given its leading dash so it is passed in the same
            # form as every other secondary-plugin option in this list.
            push(@$specific_options, "-sectionalise_using_h_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self was already blessed into $class above, so no second bless is needed.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
	223
	224	sub get_default_process_exp {
	225	my $self = shift (@_);
	226
	227	return q^(?i)\.pdf$^;
	228	}
[2661]	229
	230	# so we don't inherit HTMLPlug's block exp...
	231	sub get_default_block_exp {
	232	return "";
	233	}
[22861]	234
	235	sub init {
	236	my $self = shift (@_);
	237
	238	# ConvertBinaryFile init
	239	$self->SUPER::init(@_);
[23754]	240	$self->AutoLoadConverters::init(@_);
[22861]	241
	242	}
	243
	244	sub begin {
	245	my $self = shift (@_);
	246
[23754]	247	$self->AutoLoadConverters::begin(@_);
[22861]	248	$self->SUPER::begin(@_);
	249
	250	}
	251
	252	sub deinit {
	253	my $self = shift (@_);
[1410]	254
[23754]	255	$self->AutoLoadConverters::deinit(@_);
[22861]	256	$self->SUPER::deinit(@_);
	257
	258	}
	259
# Hashing on the GA XML ensures that two PDF files that are identical
# except for their metadata hash to different values. Without this, when
# each PDF file is converted to HTML there is a chance that the results
# will be identical if the conversion utility does not embed the metadata
# in the generated HTML. This is certainly the case when PDFBOX is being
# used.
#
# This change makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should
# be seen as different.
sub get_oid_hash_type {
    my ($self) = @_;

    return 'hash_on_ga_xml';
}
	277
	278
# Delegates explicitly to AutoLoadConverters' tmp_area_convert_file(),
# passing the caller's arguments through unchanged and returning its result.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
	285
[32206]	286	# Overriding to do some extra handling for paged_html output mode
	287	sub run_conversion_command {
	288	my $self = shift (@_);
	289	my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;
	290
	291	if($self->{'convert_to'} ne "paged_html") {
	292	return $self->ConvertBinaryFile::run_conversion_command(@_);
	293	}
	294
	295	# if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it
	296	# to create a subdir called "pages" in the tmp area to puts its products
	297	# in there. (Xpdf's pdftohtml needs to be passed a non-existent directory
	298	# parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,
	299	# the intermediary output file tmp/<random-num>/pages/index.html should
	300	# exist (besides other output products there)
	301
	302	# We let ConvertBinaryFile proceed normally, but the return value should reflect
	303	# that on success it should expect the intermediary product tmpdir/pages/index.html
	304	# (which is the product of xpdftohtml conversion).
	305	my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_);
	306	$output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");
	307
	308	# However, when convert_post_process() is done, it should have output the final
	309	# product of the paged_html conversion: an html file of the same name and in the
	310	# same tmp location as the input PDF file.
	311
	312	my ($name_prefix, $output_dir, $ext)
	313	= &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
	314	$self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
	315	# print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";
	316
	317	return $output_filename;
	318	}
	319
# Dispatches post-processing of the converted file: paged_html output gets
# the special Xpdf pdftohtml massaging, every other mode gets PDFPlugin's
# usual post-processing. Returns whatever the chosen routine returns.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # HTML pages generated by xpdf's pdftohtml need to be massaged into the
    # form we want.
    return $self->xpdftohtml_convert_post_process($conv_filename)
        if $self->{'convert_to'} eq "paged_html";

    return $self->default_convert_post_process($conv_filename);
}
	336
	337	# Called after gsConvert.pl has been run to convert a PDF to paged_html
	338	# using Xpdftools' pdftohtml
	339	# This method will do some cleanup of the HTML files produced after XPDF has produced
	340	# an HTML doc for each PDF page: it first gets rid of the default index.html.
	341	# Instead, it constructs a single html page containing each original HTML page
	342	# <body> nested as divs instead, with simple section information inserted at the top
	343	# of each 'page' <div> and some further styling customisation. This HTML manipulation
	344	# is to be done with the Mojo::DOM perl package.
	345	# Note that since xpdf's pdftohtml would have failed if the output dir already
	346	# existed and for simpler naming, the output files are created in a new "pages"
	347	# subdirectory of the tmp location parent of $conv_filename instead
	348	sub xpdftohtml_convert_post_process
	349	{
	350	my $self = shift (@_);
[32206]	351	my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode
	352	my $output_filename = $self->{'conv_filename_after_post_process'};
	353
	354	# Read in all the html files in tmp's "pages" subdir, except for index.html.
	355	# and use it to create a new html file called $self->{'conv_filename_after_post_process'}
	356	# which will consist of a slightly modified version of
[32205]	357	# each of the other html files concatenated together.
	358
	359	my $outhandle=$self->{'outhandle'};
	360
[32206]	361	my ($tailname, $pages_subdir, $suffix)
	362	= &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");
[32205]	363
	364	# Code from util::create_itemfile()
	365	# Read in all the files
	366	opendir(DIR, $pages_subdir) \|\| die "can't opendir $pages_subdir: $!";
	367	my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR);
	368	closedir DIR;
	369	# Sort files in the directory by page_num
	370	# files are named index.html, page1.html, page2.html, ..., pagen.html
	371	sub page_number {
	372	my ($dir) = @_;
	373	my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i);
	374	$pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
	375	return $pagenum;
	376	}
	377	# sort the files in the directory in the order of page_num rather than lexically.
	378	@page_files = sort { page_number($a) <=> page_number($b) } @page_files;
	379
	380	#my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file.
	381	# For every html file there's an img file, so halve the total num.
	382	# What about other file types that may potentially be there too???
	383	my $num_html_pages = 0;
	384	foreach my $pagefile (@page_files) {
	385	$num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;
	386	}
	387
	388	# Prepare to create our new html page that will contain all the individual
	389	# htmls generated by xpdf's pdftohtml in sequence.
	390	# First write the opening html tags out to the output file. These are the
	391	# same tags and their contents, including <meta>, as is generated by
	392	# Xpdf's pdftohtml for each of its individual html pages.
	393	my $start_text = "<html>\n<head>\n";
[32206]	394	my ($output_tailname, $tmp_subdir, $html_suffix)
	395	= &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
	396	$start_text .= "<title>$output_tailname</title>\n";
[32205]	397	$start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
	398	$start_text .= "</head>\n<body>\n\n";
[32215]	399	$start_text .= "<h1>$output_tailname</h1>\n\n";
[32205]	400
	401	#handle content encodings the same way that default_convert_post_process does
	402	# $self->utf8_write_file ($start_text, $conv_filename); # will close file after write
	403	# Don't want to build a giant string in memory of all the pages concatenated
	404	# and then write it out in one go. Instead, build up the final single page
	405	# by writing each modified paged_html file out to it as this is processed.
	406	# Copying file open/close code from CommonUtil::utf8_write_file()
	407	if (!open (OUTFILE, ">:utf8", $output_filename)) {
	408	gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename);
	409	die "\n";
	410	}
	411	print OUTFILE $start_text;
	412
	413	# Get the contents of each individual HTML page generated by Xpdf, after first
	414	# modifying each, and write each out into our single all-encompassing html
	415	foreach my $pagefile (@page_files) {
	416	if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) {
	417	my $page_num = page_number($pagefile);
	418	# get full path to pagefile
	419	$pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
	420	# print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
	421	my $modified_page_contents = $self->_process_paged_html_page($pagefile, $page_num, $num_html_pages);
	422	print OUTFILE "$modified_page_contents\n\n";
	423	}
	424	}
	425
	426	# we've now created a single HTML file by concatenating (a modified version)
	427	# of each paged html file
	428	print OUTFILE "</body>\n</html>\n"; # write out closing tags
	429	close OUTFILE; # done
	430
	431	# Get rid of all the htm(l) files incl index.html in the associated "pages"
	432	# subdir, since we've now processed them all into a single html file
	433	# one folder level up and we don't want HTMLPlugin to process all of them next.
	434	&FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); # no specific whitelist, but blacklist htm(l)
	435
	436	# now the tmp area should contain a single html file contain all the html pages'
	437	# contents in sequence, and a "pages" subdir containing the screenshot images
	438	# of each page.
	439	# HTMLPlugin will process these further in the plugin pipeline
	440	}
	441
	442	# For whatever reason, most html <tags> don't get printed out in GLI
	443	# So when debugging, use this function to print them out as [tags] instead.
	444	sub _debug_print_html
	445	{
	446	my $self = shift (@_);
	447	my ($string_or_dom) = @_;
	448
	449	# can't seem to determine type of string with ref/reftype
	450	# https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable
[32206]	451	# Not needed, as $dom objects seem to get correctly stringified in string contexts
[32205]	452	# $dom.to_string/$dom.stringify seem to get called, no need to call them
	453	# https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
	454	my $escapedTxt = $string_or_dom;
	455	$escapedTxt =~ s@\<@[@sg;
	456	$escapedTxt =~ s@\>@]@sg;
	457
	458	print STDERR "#### $escapedTxt\n";
	459	}
	460
	461	# Helper function to read in each paged_html generated by Xpdf's pdftohtml
	462	# then modify the html suitably using the HTML parsing functions offered by
	463	# Mojo::DOM, then return the modified HTML content as a string
	464	# See https://mojolicious.org/perldoc/Mojo/DOM
# Reads one per-page html file generated by Xpdf's pdftohtml, rewrites it
# with Mojo::DOM and returns the result as a string.
#
# Parameters:
#   $pagefile       - path of the page's html file
#   $page_num       - number of this page
#   $num_html_pages - total number of per-page html files
#
# Returns: a '<div id="page<N>">' string holding optional <h2>/<h3>
# sectioning headings followed by an inner <div> that contains the page's
# <style> element and the child nodes of the page's <body>.
sub _process_paged_html_page
{
    my $self = shift (@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    my $text = "";

    # handling content encoding the same way default_convert_post_process does
    $self->read_file ($pagefile, "utf8", "", \$text);

    my $dom = Mojo::DOM->new($text);

    # $self->_debug_print_html($dom);

    # Take the first <style> element found under <html>; it is later inserted
    # as a child of the all-encompassing <div> created below.
    # NOTE(review): at() returns undef when nothing matches, so a page without
    # an <html> or <style> element makes this line die - confirm pdftohtml
    # always emits both.
    my $page_style_tag_str = $dom->at('html')->at('style')->to_string;
    # In the style tag, rewrite every "#f" to ".p<page_num>f", i.e. convert id
    # style references to page-specific class style references
    my $css_class = ".p".$page_num."f";
    $page_style_tag_str =~ s@\#f@$css_class@sg;
    my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified
    #$self->_debug_print_html($style_element);

    # need to know the image's height to set the height of the surrounding
    # div that's to replace this page's <body>: taken from the first <img>
    my $img_height = $dom->find('img')->[0]{height};

    # Adjust the img#background src attribute to point to the pages subdir
    # for imgs, set that img tag's class=background, and change its id to
    # "background" + $page_num
    my $bg_img_tag=$dom->find('img#background')->[0];
    my $img_src_str = $bg_img_tag->{src};
    $img_src_str = "pages/$img_src_str";
    $bg_img_tag->attr(src => $img_src_str); # reset
    #$self->_debug_print_html($bg_img_tag);
    # set both class and modified id attributes in one step:
    $bg_img_tag->attr({class => "background", id => "background".$page_num});
    #$self->_debug_print_html($bg_img_tag);

    # get all the <span> nested inside <div class="txt"> elements and
    # 1. set their class attr to be "p + page_num + id-of-the-span",
    # 2. then delete the id, because the span ids have been reused when element
    # ids ought to be unique. Which is why we set the modified ids to be the
    # value of the class attribute instead
    $dom->find('div.txt span')->each(sub {
        $_->attr(class => "p". $page_num. $_->{id});
        delete $_->{id};
    }); # both changes done in one find() operation
    #$self->_debug_print_html($dom->find('div.txt span')->last);

    # Finally can create our new dom, starting with a div tag for the current page:
    # <div style="position: relative; height: <img_height>px;"> (the page id
    # goes on the outer div built further below)
    # my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" )
    my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );
    #$self->_debug_print_html($new_dom);
    $new_dom->at('div')->append_content($style_element)->root;


    #$self->_debug_print_html($new_dom);
    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
    #$self->_debug_print_html($new_dom);


    # build up the outer div with the <h>tags for sectionalising
    my $inner_div_str = $new_dom->to_string;

    my $page_div = "<div id=\"page".$page_num."\">\n";
    # Append a page range bucket heading if applicable: only on the first page
    # of a bucket of 10 pages (page_num % 10 == 1) AND when more than 10 pages
    # follow the current page.
    # Dr Bainbridge thinks for now we need only consider PDFs where the
    # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
    # e.g. "Pages 31-40"
    # NOTE(review): because of the "> 10" test, the last bucket start(s) of a
    # document get no range heading (e.g. page 11 of a 20-page PDF) - confirm
    # this is intended.
    if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) {
        # Double-digit page numbers that start with 2
        # i.e. 21 to 29 (and 30) should be in 21 to 30 range
        my $start_range = $page_num - ($page_num % 10) + 1;
        my $end_range = $page_num + 10 - ($page_num % 10);
        $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h2>\n";
    }

    # No sectionalising for 10 pages or under. Otherwise, every page is a section too, not just buckets
    if($num_html_pages > 10) {
        # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #"
        $page_div .= "<h3 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h3>\n";
    }

    $page_div .= $inner_div_str;
    $page_div .= "\n</div>";

    # Finished processing a single html page of the paged_html output generated by
    # Xpdf's pdftohtml: finished massaging that single html page into the right form
    return $page_div;
}
	561
	562	# This subroutine is called to do the PDFPlugin post-processing for all cases
	563	# except the "paged_html" conversion mode. This is what PDFPlugin always used to do:
	564	sub default_convert_post_process
	565	{
	566	my $self = shift (@_);
	567	my ($conv_filename) = @_;
	568	my $outhandle=$self->{'outhandle'};
	569
[15963]	570	#$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
	571	#my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);
[8218]	572
[10273]	573	# read in file ($text will be in utf8)
	574	my $text = "";
[15963]	575	# encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
	576	$self->read_file ($conv_filename, "utf8", "", \$text);
[10273]	577
[24159]	578	# To support the use_sections option with PDFBox: Greenstone splits PDFs into pages for
	579	# sections. The PDFPlugin code wants each new page to be prefixed with <a name=pagenum></a>,
	580	# which it then splits on to generate page-based sections. However, that's not what PDFBox
	581	# generates in its HTML output. Fortunately, PDFBox does have its own page-separator: it
	582	# embeds each page in an extra div. The div opener is:
	583	# <div style=\"page-break-before:always; page-break-after:always\">
[24476]	584	# The PDFPlugin now looks for this and prefixes <a name=0></a> to each such div. (The
[24159]	585	# pagenumber is fixed at 0 since I'm unable to work out how to increment the pagenum during
	586	# a regex substitution even with regex extensions on.) Later, when we process each section
	587	# to get the pagenum, PDFBox's output for this is pre-processed by having a loopcounter
	588	# that increments the pagenum for each subsequent section.
	589
	590	#$pdfbox_pageheader="\<div style=\"page-break-before:always; page-break-after:always\">";
	591	my $loopcounter = 0; # used later on!
	592	$text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;
	593
	594
[10273]	595	# Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
	596	# for each page). Metadata based on this calculation not set until process()
	597	#
[24476]	598	# Note: this is done even if we are not breaking the document into pages as it might
[10273]	599	# be useful to give an indication of document length in browser through setting
	600	# num_pages as metadata.
[30491]	601	# Clean html from low and hight surrogates D800âDFFF
[30492]	602	$text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
[24476]	603	my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig); #<div style=\"?page-break-before:always; page-break-after:always\"?>
[10273]	604	my $num_pages = scalar(@pages);
	605	$self->{'num_pages'} = $num_pages;
	606
[3411]	607	if ($self->{'use_sections'}
	608	&& $self->{'converted_to'} eq "HTML") {
	609
[15872]	610	print $outhandle "PDFPlugin: Calculating sections...\n";
[3411]	611
[3614]	612	# we have "<a name=1></a>" etc for each page
[8795]	613	# it may be <A name=
[10273]	614	my @sections = split('<[Aa] name=', $text);
[3411]	615
[10273]	616	my $top_section = "";
	617
[7019]	618	if (scalar (@sections) == 1) { #only one section - no split!
[15872]	619	print $outhandle "PDFPlugin: warning - no sections found\n";
[7019]	620	} else {
[10273]	621	$top_section .= shift @sections; # keep HTML header etc as top_section
[7019]	622	}
	623
[3411]	624	# handle first section specially for title? Or all use first 100...
	625
	626	my $title = $sections[0];
[8795]	627	$title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
[3411]	628	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	629	$title =~ s/<[^>]*>/ /g;
	630	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
	631	$title =~ s/^\s+//s;
	632	$title =~ s/\s+$//;
	633	$title =~ s/\s+/ /gs;
	634	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	635	$title =~ s/^\s+//s; # in case title_sub introduced any...
	636	$title = substr ($title, 0, 100);
	637	$title =~ s/\s\S*$/.../;
	638
[10273]	639
[7019]	640	if (scalar (@sections) == 1) { # no sections found
[10273]	641	$top_section .= $sections[0];
[7019]	642	@sections=();
	643	} else {
[10273]	644	$top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
[7019]	645	}
[3411]	646
	647	# add metadata per section...
	648	foreach my $section (@sections) {
[8795]	649	# section names are not always just digits, may be like "outline"
	650	$section =~ s@^\"?(\w+)\"?></a>@@; # leftover from split expression...
[3614]	651
[3411]	652	$title = $1; # Greenstone does magic if sections are titled digits
[24159]	653
	654	# A title of pagenum=0 means use_sections is being applied on output from PDFBox,
	655	# which didn't originally have a <a name=incremented pagenumber></a> to split each page.
	656	# Our Perl code then prefixed <a name=0></a> to it. Now need to increment the pagenum here:
	657	if($loopcounter > 0 \|\| ($title eq 0 && $loopcounter == 0)) { # implies use_sections with PDFBox
	658	$title = ++$loopcounter;
	659	}
	660
[3411]	661	if (! defined($title) ) {
	662	print STDERR "no title: $section\n";
[8795]	663	$title = " "; # get rid of the undefined warning in next line
[3411]	664	}
[15872]	665	my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
[3411]	666	$newsection .= "<Metadata name=\"Title\">" . $title
[24159]	667	. "</Metadata>\n--><br />\n";
[3411]	668	$newsection .= $section;
	669	$newsection .= "<!--</Section>-->\n";
	670	$section = $newsection;
	671	}
	672
[10273]	673	$text=join('', ($top_section, @sections));
[3411]	674	}
	675
[24199]	676	if ($self->{'use_sections'}
	677	&& $self->{'converted_to'} eq "text") {
	678	print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
	679	}
[22953]	680
[24199]	681
[22953]	682	# The following should no longer be needed, now that strings
	683	# read in are Unicode aware (in the Perl sense) rather than
	684	# raw binary strings that just happen to be UTF-8 compliant
	685
[8218]	686	# turn any high bytes that aren't valid utf-8 into utf-8.
[22953]	687	## unicode::ensure_utf8(\$text);
[8218]	688
[10273]	689	# Write it out again!
	690	$self->utf8_write_file (\$text, $conv_filename);
	691	}
[7287]	692
	693
[10273]	694	# do plugin specific processing of doc_obj for HTML type
	695	sub process {
	696	my $self = shift (@_);
[15872]	697	my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[7287]	698
[15963]	699	my $result = $self->process_type($base_dir,$file,$doc_obj);
[10273]	700
[8226]	701	# fix up the extracted date metadata to be in Greenstone date format,
	702	# and fix the capitalisation of 'date'
[8227]	703	my $cursection = $doc_obj->get_top_section();
	704	foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
[7287]	705	$doc_obj->delete_metadata($cursection, "date", $datemeta);
	706
	707	# We're just interested in the date bit, not the time
[8278]	708	# some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
	709	# set a /CreationDate, and set /ModDate to 000000000. pdftohtml
	710	# extracts the ModDate, so it is 0...
	711	$datemeta =~ /(\d+)-(\d+)-(\d+)/;
	712	my ($year, $month, $day) = ($1,$2,$3);
	713	if (defined($year) && defined($month) && defined($day)) {
	714	if ($year == 0) {next}
	715	if ($year < 100) {$year += 1900} # just to be safe
	716	if ($month =~ /^\d$/) {$month="0$month"} # single digit
	717	if ($day =~ /^\d$/) {$day="0$day"} # single digit
	718	my $date="$year$month$day";
	719	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	720	}
[7287]	721	}
	722
[24476]	723	$doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};
[8795]	724
	725	if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
[30742]	726	# For gs2 we explicitly make it a paged document, cos greenstone won't get it
[8795]	727	# right if any section has an empty title, or one with letters in it
[30742]	728	if (&util::is_gs3()) {
	729	# but for gs3, paged docs currently use image slider which is ugly if there are no images
	730	$doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
	731	} else {
	732	$doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
	733	}
[8795]	734	}
[10273]	735
[7287]	736	return $result;
[1410]	737	}
	738
	739	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: