Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:keywords set to `Author Date Id Revision`
File size: 16.6 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use strict;
28	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29
30	use ReadTextFile;
31	use unicode;
32
33	use AutoLoadConverters;
34	use ConvertBinaryFile;
35
36	@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
37
38
# Allowed values for the -convert_to option. Each entry pairs the option
# value with the key of its description string, which always has the form
# {ConvertBinaryFile.convert_to.<value>}.
my $convert_to_list = [
    map {
        +{ 'name' => $_,
           'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' }
    } qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
54
# Argument descriptions for this plugin, in the order they are listed.
# 'desc' values are keys of description strings, not the text itself.
my $arguments = do {
    # Builds the description of a simple on/off flag; $owner is the
    # plugin name used as the prefix of the description key.
    my $flag = sub {
        my ($owner, $name) = @_;
        return { 'name' => $name,
                 'desc' => '{' . $owner . '.' . $name . '}',
                 'type' => 'flag' };
    };

    [
        {
            'name' => 'convert_to',
            'desc' => '{ConvertBinaryFile.convert_to}',
            'type' => 'enum',
            'reqd' => 'yes',
            'list' => $convert_to_list,
            'deft' => 'html',
        },
        {
            'name' => 'process_exp',
            'desc' => '{BaseImporter.process_exp}',
            'type' => 'regexp',
            'deft' => get_default_process_exp(),
            'reqd' => 'no',
        },
        {
            'name' => 'block_exp',
            'desc' => '{BaseImporter.block_exp}',
            'type' => 'regexp',
            'deft' => get_default_block_exp(),
        },
        {
            'name' => 'metadata_fields',
            'desc' => '{HTMLPlugin.metadata_fields}',
            'type' => 'string',
            'deft' => 'Title,Author,Subject,Keywords',
        },
        {
            'name' => 'metadata_field_separator',
            'desc' => '{HTMLPlugin.metadata_field_separator}',
            'type' => 'string',
            'deft' => '',
        },
        $flag->('PDFPlugin', 'noimages'),
        $flag->('PDFPlugin', 'allowimagesonly'),
        $flag->('PDFPlugin', 'complex'),
        $flag->('PDFPlugin', 'nohidden'),
        {
            'name'  => 'zoom',
            'desc'  => '{PDFPlugin.zoom}',
            'deft'  => '2',
            'range' => '1,3', # actually the range is 0.5-3
            'type'  => 'int',
        },
        $flag->('PDFPlugin', 'use_sections'),
        $flag->('HTMLPlugin', 'description_tags'),
        $flag->('PDFPlugin', 'use_realistic_book'),
    ];
};
107
# Plugin description record that new() appends to the caller's "OptList".
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
114
# Constructor. Builds the plugin by merging an AutoLoadConverters instance
# (asked for "PDFBoxConverter") with a ConvertBinaryFile instance, then
# settles the conversion target and assembles the options handed to the
# secondary plugin that will read the converted output.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      is appended before the parent constructors are called
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList"/"OptList"
#
# Returns the blessed plugin object. When 'info_only' is set on the merged
# object it is returned immediately, without any option processing.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Removes "Page 1" added by pdftohtml, and a leading "1", which is often
    # the page number at the top of the page. Bad luck if your document
    # title actually starts with "1 " - is there a better way?
    my $default_title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub");
    push(@$inputargs, $default_title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-style constructor calls: the indirect-object form
    # ("new Class(...)") is parse-ambiguous inside a package that itself
    # defines a sub called new.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["PDFBoxConverter"], 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to (GSDLOS may be unset, so test definedness first)
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'} && $self->{'convert_to'} ne "html") {
        print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
        $self->{'convert_to'} = "html";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # The secondary plugin gets the same default title_sub as this plugin.
    push(@$specific_options, "-title_sub", $default_title_sub);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");

        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options, "-metadata_fields", $self->{'metadata_fields'});
        } else {
            push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
215
# Default process_exp: a regular expression (as a string) that matches any
# filename ending in ".pdf", case-insensitively. Takes no meaningful
# arguments; it is also called as a plain function when the argument
# table is built.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
221
222	# so we don't inherit HTMLPlug's block exp...
# Default block_exp: the empty string, i.e. this plugin supplies no
# blocking expression of its own.
sub get_default_block_exp {
    return q{};
}
226
# Initialise the plugin: run the parent class's init first, then
# AutoLoadConverters' init, both with the caller's arguments. The value
# of the AutoLoadConverters call is what this method returns.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);    # ConvertBinaryFile init
    return $self->AutoLoadConverters::init(@_);
}
235
# Start-of-run hook: AutoLoadConverters' begin runs first, then the
# parent class's begin, both with the caller's arguments. The value of
# the parent call is what this method returns.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
243
# Shutdown hook: AutoLoadConverters' deinit runs first, then the parent
# class's deinit, both with the caller's arguments. The value of the
# parent call is what this method returns.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
251
# Hashing on the GA XML ensures that two PDF files that are
# identical except for their metadata hash to different
# values. Without this, when each PDF file is converted to
# HTML there is a chance that the results will be identical,
# because the conversion utility may not embed the metadata
# in the generated HTML. This is certainly the case when
# PDFBox is being used.

# This change makes this convert-to based plugin more
# consistent with the original vision that the same document
# with different metadata should be seen as different.
264
# Name of the OID hashing scheme for this plugin: always
# "hash_on_ga_xml", whatever the invocant.
sub get_oid_hash_type {
    return 'hash_on_ga_xml';
}
269
270
# Delegates straight to AutoLoadConverters' tmp_area_convert_file,
# passing the caller's arguments through and returning whatever that
# method returns.
sub tmp_area_convert_file {
    my $self = shift;
    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
277
# Post-process a converted file in place.
#
# Reads $conv_filename through $self->read_file, counts the page anchors
# (<a name=...>) and stores the count in $self->{'num_pages'}, optionally
# wraps each page in Greenstone <Section> comments (when 'use_sections' is
# set and the file was converted to HTML), and writes the text back with
# $self->utf8_write_file.
#
# Arguments:
#   $conv_filename - name of the converted file to rewrite
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types? will we
    # do this step for them anyway?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs
    # into pages for sections. This code wants each new page to be prefixed
    # with <a name=pagenum></a>, which it then splits on to generate
    # page-based sections. PDFBox does not generate that, but it embeds each
    # page in an extra div whose opener is:
    #   <div style="page-break-before:always; page-break-after:always">
    # So <a name=0></a> is prefixed to each such div. The page number is
    # fixed at 0 here; when the sections are processed below, $loopcounter
    # renumbers them 1, 2, 3, ...
    my $loopcounter = 0; # used later on!
    $text =~ s@\<div style=\"page-break-before:always; page-break-after:always\">@<a name=$loopcounter></a><div style=\"page-break-before:always; page-break-after:always\">@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page). Metadata based on this calculation is not set
    # until process().
    #
    # Note: this is done even if we are not breaking the document into pages
    # as it might be useful to give an indication of document length in the
    # browser through setting num_pages as metadata.

    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        if (scalar (@sections) <= 1) { # nothing to split on (or empty text)
            print $outhandle "PDFPlugin: warning - no sections found\n";
        }

        # Only rewrite the text when there is a header plus at least two
        # pages. With fewer pieces the text is left exactly as it was:
        # split() has consumed the "<a name=" separator, so re-joining a
        # lone page would leave a stray fragment such as "1></a>" behind.
        if (scalar (@sections) > 2) {
            my $top_section = shift @sections; # keep HTML header etc as top_section

            # The document title is taken from the text of the first page.
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            # nbsp as an entity, as raw utf-8 bytes, or as a decoded character
            $title =~ s/(?:&nbsp;|\xc2\xa0|\x{a0})/ /g;
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            # Truncate to 100 characters at a word boundary; a title that
            # already fits is kept whole rather than losing its last word.
            if (length ($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like "outline".
                # Only trust $1 when the substitution matched; after a
                # failed match it would still hold an older capture.
                my $section_title;
                if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                    $section_title = $1; # Greenstone does magic if sections are titled digits
                }

                # A title of 0 means use_sections is being applied to output
                # from PDFBox, where <a name=0></a> was prefixed above. Once
                # that has been seen, every section is numbered by the counter.
                if ($loopcounter > 0
                    || (defined ($section_title) && $section_title eq "0")) {
                    $section_title = ++$loopcounter;
                }

                if (! defined ($section_title)) {
                    print STDERR "no title: $section\n";
                    $section_title = " "; # keep the metadata element well-formed
                }

                my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
                $newsection .= "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><br />\n";
                $newsection .= $section;
                $newsection .= "<!--</Section>-->\n";
                $section = $newsection; # $section aliases the array element
            }

            $text = join('', ($top_section, @sections));
        }
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
407
408
409	# do plugin specific processing of doc_obj for HTML type
# Plugin-specific processing of $doc_obj after conversion.
#
# Calls $self->process_type($base_dir, $file, $doc_obj), then on the
# document's top section:
#   - replaces every "date" value with a "Date" value in YYYYMMDD form
#     (values with no d-d-d pattern or a zero year are simply dropped),
#   - adds "NumPages" when $self->{'num_pages'} is defined,
#   - sets "gsdlthistype" when sections were generated from HTML.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
# only $base_dir, $file and $doc_obj are used here.
# Returns whatever process_type returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the values: entries are deleted from the document
    # inside the loop.
    my $date_values = $doc_obj->get_metadata($cursection, "date");
    my @date_values = defined ($date_values) ? @$date_values : ();

    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Only read the captures when the match succeeded; otherwise they
        # would still hold values from an earlier match.
        next unless ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        my ($year, $month, $day) = ($1, $2, $3);

        next if ($year == 0);
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month = "0$month"} # single digit
        if ($day =~ /^\d$/) {$day = "0$day"} # single digit
        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
453
454	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: