Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago
First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 3. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 4. All the perl code changes to do with using xpdftools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftools output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.
Property svn:keywords set to `Author Date Id Revision`
File size: 28.4 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use strict;
28	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29	no strict 'subs'; # allow filehandles to be variables and viceversa
30
31	use ReadTextFile;
32	use unicode;
33	use Mojo::DOM; # for HTML parsing
34
35	use AutoLoadConverters;
36	use ConvertBinaryFile;
37
38	@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
39
40
# Values accepted by the -convert_to option. Each row pairs the option value
# with the dictionary key that holds its description.
my $convert_to_list = [
    map { +{ 'name' => $_->[0], 'desc' => $_->[1] } } (
        [ "auto",         "{ConvertBinaryFile.convert_to.auto}" ],
        [ "html",         "{ConvertBinaryFile.convert_to.html}" ],
        [ "text",         "{ConvertBinaryFile.convert_to.text}" ],
        [ "paged_html",   "{PDFPlugin.convert_to.paged_html}" ],
        [ "pagedimg_jpg", "{ConvertBinaryFile.convert_to.pagedimg_jpg}" ],
        [ "pagedimg_gif", "{ConvertBinaryFile.convert_to.pagedimg_gif}" ],
        [ "pagedimg_png", "{ConvertBinaryFile.convert_to.pagedimg_png}" ],
    )
];
57
58
# Plugin-specific option table. Each entry is a hash describing one option:
# its name, the dictionary key of its description, its type, and (where
# given) a default value, allowed range/list and whether it is required.
my $arguments = [
    {   name => "convert_to",
        type => "enum",
        reqd => "yes",
        deft => "html",
        list => $convert_to_list,
        desc => "{ConvertBinaryFile.convert_to}",
    },
    {   name => "process_exp",
        type => "regexp",
        reqd => "no",
        deft => get_default_process_exp(),
        desc => "{BaseImporter.process_exp}",
    },
    {   name => "block_exp",
        type => "regexp",
        deft => get_default_block_exp(),
        desc => "{CommonUtil.block_exp}",
    },
    {   name => "metadata_fields",
        type => "string",
        deft => "Title,Author,Subject,Keywords",
        desc => "{HTMLPlugin.metadata_fields}",
    },
    {   name => "metadata_field_separator",
        type => "string",
        deft => "",
        desc => "{HTMLPlugin.metadata_field_separator}",
    },
    { name => "noimages",        type => "flag", desc => "{PDFPlugin.noimages}" },
    { name => "allowimagesonly", type => "flag", desc => "{PDFPlugin.allowimagesonly}" },
    { name => "complex",         type => "flag", desc => "{PDFPlugin.complex}" },
    { name => "nohidden",        type => "flag", desc => "{PDFPlugin.nohidden}" },
    {   name  => "zoom",
        type  => "int",
        deft  => "2",
        range => "1,3",    # actually the range is 0.5-3
        desc  => "{PDFPlugin.zoom}",
    },
    { name => "use_sections",       type => "flag", desc => "{PDFPlugin.use_sections}" },
    { name => "description_tags",   type => "flag", desc => "{HTMLPlugin.description_tags}" },
    { name => "use_realistic_book", type => "flag", desc => "{PDFPlugin.use_realistic_book}" },
];
111
# Plugin description record that new() pushes onto the shared "OptList".
my $options = {
    'args'           => $arguments,
    'srcreplaceable' => "yes",    # Source docs in PDF can be replaced with GS-generated html
    'inherits'       => "yes",
    'abstract'       => "no",
    'desc'           => "{PDFPlugin.desc}",
    'name'           => "PDFPlugin",
};
118
# Constructor.
#
# Arguments: ($class, $pluginlist, $inputargs, $hashArgOptLists).
#  - $class is appended to @$pluginlist.
#  - a "-title_sub" option is appended to @$inputargs.
#  - this plugin's argument table and option record are appended to the
#    "ArgList" and "OptList" entries of $hashArgOptLists.
#
# The object is built by merging an AutoLoadConverters instance with a
# ConvertBinaryFile instance. When 'info_only' is set the merged object is
# blessed and returned straight away. Otherwise the conversion options and
# the options for the secondary plugin are set up, and the secondary plugins
# are loaded before the object is returned.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    # Kept in one place because it is handed both to this plugin and to the
    # secondary plugin.
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Arrow method calls rather than "new Class(...)": this package defines
    # its own sub new, which is exactly the situation in which indirect
    # object syntax can be mis-parsed.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self was already blessed into $class above; no second bless needed.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
220
# Returns the default process_exp: a case-insensitive pattern (as a string)
# matching file names that end in ".pdf". Also called without an invocant
# when the argument table is built, so the argument is never used.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_suffix_re = q^(?i)\.pdf$^;
    return $pdf_suffix_re;
}
226
# Returns an empty default block_exp, so that HTMLPlug's block expression
# is not inherited.
sub get_default_block_exp {
    my $no_block_exp = "";
    return $no_block_exp;
}
231
# Runs the parent class's init() first, then AutoLoadConverters' init(),
# forwarding the remaining arguments unchanged to both. Returns whatever the
# AutoLoadConverters call returns.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);    # ConvertBinaryFile init
    return $self->AutoLoadConverters::init(@_);
}
240
# Runs AutoLoadConverters' begin() first, then the parent class's begin(),
# forwarding the remaining arguments unchanged to both. Returns whatever the
# parent call returns.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
248
# Runs AutoLoadConverters' deinit() first, then the parent class's deinit(),
# forwarding the remaining arguments unchanged to both. Returns whatever the
# parent call returns.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
256
# Hashing is done on the GA xml so that two PDF files that are identical
# except for their metadata hash to different values. Without this, when
# each PDF file is converted to HTML there is a chance that both results
# are identical if the conversion utility does not embed the metadata in
# the generated HTML. This is certainly the case when PDFBOX is being used.
#
# This makes this convert-to based plugin more consistent with the original
# vision that the same document with different metadata should be seen as
# different.
sub get_oid_hash_type {
    my ($self) = @_;

    my $hash_type = "hash_on_ga_xml";
    return $hash_type;
}
274
275
# Delegates explicitly to AutoLoadConverters' tmp_area_convert_file(),
# passing the remaining arguments through and returning its result.
sub tmp_area_convert_file {
    my $self = shift;

    my $delegate = 'AutoLoadConverters::tmp_area_convert_file';
    return $self->$delegate(@_);
}
282
# Dispatches post-processing of the converted file $conv_filename.
# In "paged_html" mode the HTML pages generated by xpdf's pdftohtml need to
# be massaged into the form we want, so a dedicated routine handles that;
# every other mode uses PDFPlugin's usual post processing. Returns whatever
# the chosen routine returns.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n";

    return $self->xpdftohtml_convert_post_process($conv_filename)
        if $self->{'convert_to'} eq "paged_html";

    return $self->default_convert_post_process($conv_filename);
}
300
# Returns the page number encoded in a file name of the form
# page<N>.htm(l), matched case-insensitively. Any other name, such as
# index.html, yields 0.
sub page_number {
    my ($dir) = @_;
    my ($pagenum) = ($dir =~ m/^page(\d+)\.html?$/i);
    return defined $pagenum ? $pagenum : 0;
}

# Called after gsConvert.pl has been run to convert a PDF to paged_html
# using Xpdftools' pdftohtml, which produces one HTML doc per PDF page plus
# an index.html. This method builds a single html page at $output_filename
# containing each original page (as rewritten by _process_paged_html_page)
# in sequence, then removes all the htm(l) files from the "pages" subdir so
# that HTMLPlugin does not process them individually later in the pipeline.
#
# $output_filename is the tmp location + filename, as if a single html had
# been generated. Since xpdf's pdftohtml would have failed if the output dir
# already existed, and for simpler naming, its output lives in a "pages"
# subdirectory next to $output_filename. Afterwards that subdir still holds
# the screenshot image of each page.
#
# Dies if the pages directory cannot be read or the output file cannot be
# written. Returns the result of FileUtils::removeFilesFiltered.
sub xpdftohtml_convert_post_process
{
    my $self = shift (@_);
    my ($output_filename) = @_;

    my ($tailname, $tmp_dir)
        = File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    my $pages_subdir = FileUtils::filenameConcatenate($tmp_dir, "pages");

    # Read in all the plain files of the pages directory
    opendir(my $pages_dh, $pages_subdir) or die "can't opendir $pages_subdir: $!";
    my @page_files = grep {-f "$pages_subdir/$_"} readdir($pages_dh);
    closedir($pages_dh);

    # Keep only the per-page html files (index.html is skipped), ordered by
    # page number rather than lexically: page1.html, page2.html, ..., pagen.html
    my @html_pages = sort { page_number($a) <=> page_number($b) }
        grep { m/\.html?$/ && !m/^index\.html?/i } @page_files;
    my $num_html_pages = scalar(@html_pages);

    # Opening html tags of the combined page. These are the same tags and
    # contents, including <meta>, as Xpdf's pdftohtml generates for each of
    # its individual html pages.
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";

    # Don't build a giant string in memory of all the pages concatenated.
    # Instead each modified page is written out as soon as it is processed.
    my $out_fh;
    if (!open ($out_fh, ">:utf8", $output_filename)) {
        # NOTE(review): gsprintf is not imported in the visible part of this
        # file - confirm it is available in this package.
        gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    foreach my $pagefile (@html_pages) {
        my $page_num = page_number($pagefile);
        # Separate variable for the full path: assigning to the loop variable
        # would overwrite the list element it aliases.
        my $pagefile_path = FileUtils::filenameConcatenate($pages_subdir, $pagefile);
        # print STDERR "@@@ About to process html file $pagefile_path (num $page_num)\n";
        my $modified_page_contents = $self->_process_paged_html_page($pagefile_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    print $out_fh "</body>\n</html>\n"; # write out closing tags
    # Buffered write errors only surface at close, so check it.
    close($out_fh)
        or die "PDFPlugin::xpdftohtml_convert_post_process: error closing $output_filename: $!\n";

    # Get rid of all the htm(l) files incl index.html in the "pages" subdir.
    # The pattern is single-quoted on purpose: in a double-quoted string the
    # backslash before the dot is dropped, turning the pattern into
    # ".html?$", which also matches names that merely end in "html".
    FileUtils::removeFilesFiltered($pages_subdir, '\.html?$');
}
403
# Debugging aid. For whatever reason, most html <tags> don't get printed out
# in GLI, so this prints the given string (or DOM object, which is used in
# its string form) to STDERR with every '<' shown as '[' and every '>' as
# ']', prefixed by "#### ".
sub _debug_print_html
{
    my ($self, $string_or_dom) = @_;

    # Interpolation gives the string form of a DOM object, see
    # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
    my $bracketed = "$string_or_dom";
    $bracketed =~ tr/<>/[]/;

    print STDERR "#### " . $bracketed . "\n";
}
423
# Helper function to read in one paged_html page generated by Xpdf's
# pdftohtml, modify the html using the HTML parsing functions offered by
# Mojo::DOM, and return the modified HTML content as a string.
# See https://mojolicious.org/perldoc/Mojo/DOM
#
# Parameters:
#   $pagefile        - path of the page's html file
#   $page_num        - number of this page, used in generated ids and classes
#   $num_html_pages  - total number of html pages, used for the bucket heading
#
# The result is a <div id="page$page_num"> holding, in order: the page's
# <style> element (with "#f" id selectors turned into ".p<page_num>f" class
# selectors), an optional "Pages a-b" <h1>, a "Page N" <h2>, and all child
# nodes of the original <body>.
#
# Dies if the page has no <style> element or no img#background element.
sub _process_paged_html_page
{
    my $self = shift (@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    my $text = "";

    # handling content encoding the same way default_convert_post_process does
    $self->read_file ($pagefile, "utf8", "", \$text);

    my $dom = Mojo::DOM->new($text);

    # The page's <style> element has to move into the <div> created below.
    # Fail with a clear message instead of "Can't call method on an
    # undefined value" when the page does not have the expected structure.
    my $html_element = $dom->at('html');
    my $page_style_element = defined $html_element ? $html_element->at('style') : undef;
    die "PDFPlugin::_process_paged_html_page: no <style> element found in $pagefile\n"
        unless defined $page_style_element;

    # In the style tag, convert id style references to class style references
    my $page_style_tag_str = $page_style_element->to_string;
    my $css_class = ".p".$page_num."f";
    $page_style_tag_str =~ s@\#f@$css_class@sg;
    my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified

    my $bg_img_tag = $dom->find('img#background')->[0];
    die "PDFPlugin::_process_paged_html_page: no img#background element found in $pagefile\n"
        unless defined $bg_img_tag;

    # need to know the image's height to set the height of the surrounding
    # div that's to replace this page's <body>. The first <img> exists here
    # because img#background was found above.
    my $img_height = $dom->find('img')->[0]{height};

    # 1. Fix up the style attr on the image by additionally setting z-index
    #    to -1. A CSS declaration is "property: value;"; the "z-index=-1;"
    #    form is not a valid declaration.
    # 2. Adjust the img#background src attribute to point to the pages subdir
    # 3. Set that img tag's class=background, and change its id to
    #    background+$page_num
    my $img_style_str = $bg_img_tag->{style};
    $img_style_str = "" unless defined $img_style_str;
    # make sure the existing declarations are terminated before appending
    $img_style_str =~ s/;?\s*$/;/ if $img_style_str =~ /\S/;
    $img_style_str .= " z-index: -1;";
    my $img_src_str = "pages/" . $bg_img_tag->{src};
    $bg_img_tag->attr({
        style => $img_style_str,
        src   => $img_src_str,
        class => "background",
        id    => "background".$page_num,
    });

    # For all the <span> nested inside <div class="txt"> elements:
    # 1. set their class attr to be "p + page_num + id-of-the-span",
    # 2. then delete the id, because the span ids are reused on every page
    #    when element ids ought to be unique.
    $dom->find('div.txt span')->each(sub {
        $_->attr(class => "p". $page_num. $_->{id});
        delete $_->{id};
    });

    # Create the new dom, starting with a div tag for the current page:
    # <div id="page$page_num" style="position: relative; height: ${img_height}px;">
    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" );
    my $page_div = $new_dom->at('div');
    $page_div->append_content($style_element);

    # Append a page range bucket heading if applicable.
    # Dr Bainbridge thinks for now we need only consider PDFs where the
    # total number of pages < 1000 and create buckets of size 10
    # (e.g. 1-10, ... 51-60, ...)
    # NOTE(review): the heading is added to every page that has more than 10
    # pages after it, not only to the first page of a bucket - confirm that
    # this is the intended behaviour.
    if(($num_html_pages - $page_num) > 10) {
        # Double-digit page numbers that start with 2
        # i.e. 21 to 29 (and 30) should be in 21 to 30 range
        my $start_range = $page_num - ($page_num % 10) + 1;
        my $end_range = $page_num + 10 - ($page_num % 10);
        if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range
            $start_range -= 10;
            $end_range -= 10;
        }
        $page_div->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range));
    }

    # Add a simpler heading: just the pagenumber, "Page #"
    $page_div->append_content($new_dom->new_tag('h2', "Page ".$page_num));

    # Copy across all the old html's body tag's child nodes into the new div
    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_) });

    # Finished massaging this single html page into the right form
    return $new_dom->to_string;
}
521
# This subroutine is called to do the PDFPlugin post-processing for all cases
# except the "paged_html" conversion mode. This is what PDFPlugin always
# used to do.
#
# It reads the converted file $conv_filename through $self->read_file,
# counts the page anchors (<a name=...>) and stores the count in
# $self->{'num_pages'}. When use_sections is on and the conversion produced
# HTML, it wraps every page in Greenstone <Section> comment markup. The text
# is then written back through $self->utf8_write_file, whose result is
# returned.
sub default_convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;
    my $outhandle=$self->{'outhandle'};

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs
    # into pages for sections. The PDFPlugin code wants each new page to be
    # prefixed with <a name=pagenum></a>, which it then splits on to generate
    # page-based sections. PDFBox does not generate that, but it does embed
    # each page in an extra div whose opener is:
    # <div style="page-break-before:always; page-break-after:always">
    # So <a name=0></a> is prefixed to each such div. The page number is
    # fixed at 0 here; when the sections are processed further down, the
    # loop counter supplies an incrementing page number for each section.
    my $loopcounter = 0; # used later on!
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page). Metadata based on this calculation not set until
    # process().
    #
    # Note: this is done even if we are not breaking the document into pages
    # as it might be useful to give an indication of document length in
    # browser through setting num_pages as metadata.

    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # handle first section specially for title? Or all use first 100...
        # NOTE(review): the last substitution below replaces the final word
        # with "..." even when the title is shorter than 100 characters -
        # confirm whether that is intended.
        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline".
            # Strip the leftover from the split expression and take the name
            # as the title - Greenstone does magic if sections are titled
            # digits. $1 is only read when the substitution matched: after a
            # failed match it still holds the capture of an earlier match.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                $section_title = $1;
            }

            # A title of pagenum=0 means use_sections is being applied on
            # output from PDFBox, which was given <a name=0></a> prefixes
            # above. Now need to increment the pagenum here:
            if($loopcounter > 0
               || (defined($section_title) && $section_title eq 0 && $loopcounter == 0)) { # implies use_sections with PDFBox
                $section_title = ++$loopcounter;
            }

            if (! defined($section_title) ) {
                print STDERR "no title: $section\n";
                $section_title = " "; # get rid of the undefined warning in next line
            }
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><br />\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # rewrites the element of @sections in place
        }

        $text=join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
652
653
# Do plugin specific processing of doc_obj for HTML type.
#
# Calls $self->process_type for the file, then on the top section of
# $doc_obj:
#  - replaces every "date" metadata value by a "Date" value in Greenstone
#    date format (yyyymmdd); values that hold no y-m-d date, or whose year
#    is 0, are only removed
#  - adds "NumPages" when $self->{'num_pages'} is defined
#  - sets "gsdlthistype" when use_sections is on and the conversion
#    produced HTML
# Returns the result of process_type.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the values, since entries are deleted from the
    # document while looping.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Taking the captures from the match's return list leaves them
        # undefined when the value does not match, instead of picking up
        # $1..$3 from some earlier successful match.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless defined($year) && defined($month) && defined($day);
        next if $year == 0;

        $year += 1900 if $year < 100; # just to be safe
        $month = "0$month" if $month =~ /^\d$/; # single digit
        $day = "0$day" if $day =~ /^\d$/; # single digit
        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
698
699	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: