Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

PDFv1Plugin.pm@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. The pdfbox_conversion options are moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. The PDFv2Plugin is yet to be introduced. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote that the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it, for symmetry's sake, to pdfbox, even though, being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One is for migrating users, to indicate that PDFPlugin is being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, references to paged_html need to be removed from PDFPlugin.

File size: 15.1 KB

Line
1	###########################################################################
2	#
3	# PDFv1Plugin.pm -- The older pdf plugin, which uses the older pdftohtml
4	# tool that can't handle newer versions of PDFs.
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999-2018 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	package PDFv1Plugin;
27
28	use strict;
29	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
30	no strict 'subs'; # allow filehandles to be variables and viceversa
31
32	use ConvertBinaryFile;
33	use ReadTextFile;
34	use unicode;
35
36
37	@PDFv1Plugin::ISA = ('ConvertBinaryFile', 'ReadTextFile');
38
# PDFv1 plugin should be returned to being more like PDFPlugin was before the
# inclusion of the AutoLoadConverters/PDFBox extension, i.e. like
# http://trac.greenstone.org/browser/main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm?rev=22597
#
# Output formats offered by the convert_to option. Each entry's description
# key is "{ConvertBinaryFile.convert_to.<name>}".
my $convert_to_list = [
    map {
        +{ 'name' => $_,
           'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' }
    } qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
55
56
# Plugin argument table: one hash per option, giving its name, the
# strings-file key of its description, its type and (where present) its
# default value, range, enum list and whether it is required.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'list' => $convert_to_list,
        'deft' => 'html',
        'reqd' => 'yes',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{CommonUtil.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title,Author,Subject,Keywords',
    },
    {
        'name' => 'metadata_field_separator',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'type'  => 'int',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
    {
        'name' => 'use_realistic_book',
        'desc' => '{PDFPlugin.use_realistic_book}',
        'type' => 'flag',
    },
];
109
# Plugin description record that new() appends to the caller's OptList.
my $options = {
    'name'           => 'PDFv1Plugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
116
# Constructor.
#
# Parameters:
#   $class           - class name to bless the plugin into
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      is appended before they are parsed
#   $hashArgOptLists - hash ref with "ArgList" and "OptList" array refs, to
#                      which this plugin's argument table and description
#                      record are appended
#
# Returns the blessed plugin object. When the 'info_only' field is set on the
# constructed object, it is returned straight away without any further setup.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The following title_sub removes "Page 1" added by pdftohtml, and a
    # leading "1", which is often the page number at the top of the page.
    # Bad luck if your document title actually starts with "1 " - is there
    # a better way?
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Arrow syntax rather than "new ConvertBinaryFile(...)": the indirect
    # object form is ambiguous inside a package that defines its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Bless once, up front; nothing below needs the object re-blessed.
    $self = bless $self, $class;

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return $self;
    }

    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    # PDFv1Plugin only ever uses the old pdftohtml conversion tool
    $self->{'convert_options'} = "-pdf_tool pdftohtml";
    $self->{'convert_options'} .= " -pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # TODO: Start supporting PDF to txt on Windows if we're going to be using
    # XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    if ($self->{'convert_to'} eq "text" && $gsdlos =~ /^windows$/i) {
        print STDERR "*** On Windows, PDFv1Plugin does not support pdf to text. PDFs will be converted to HTML instead.\n";
        print STDERR "*** Use PDFv2Plugin if you really want pdf to text conversion.\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # The secondary plugin gets the same title_sub as this plugin.
    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
218
# Default process_exp: matches filenames ending in ".pdf", case-insensitively.
sub get_default_process_exp {
    my $self = shift (@_);    # unused; also called as a plain function

    my $pdf_suffix_re = '(?i)\.pdf$';
    return $pdf_suffix_re;
}
224
# Default block_exp is the empty string, so that we don't inherit
# HTMLPlug's block exp...
sub get_default_block_exp {
    my $no_block_exp = "";
    return $no_block_exp;
}
229
230
# Setting hashing to be on the GA XML ensures that two PDF files that are
# identical except for their metadata hash to different values. Without
# this, when each PDF file is converted to HTML there is a chance that the
# results will be identical, if the conversion utility does not embed the
# metadata in the generated HTML. This is certainly the case when PDFBOX is
# being used.
#
# This change makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should be
# seen as different.
sub get_oid_hash_type {
    my $self = shift (@_);

    my $hash_type = "hash_on_ga_xml";
    return $hash_type;
}
248
249
250	#sub tmp_area_convert_file {
251	#
252	# my $self = shift (@_);
253	# return $self->AutoLoadConverters::tmp_area_convert_file(@_);
254	#
255	#}
256
# Post-process the file produced by the conversion step, in place.
#
# Reads $conv_filename through $self->read_file (as utf8), replaces surrogate
# code points with spaces, records the number of page anchors in
# $self->{'num_pages'}, and - when use_sections is set and the file was
# converted to HTML - wraps each page in Greenstone <Section> comment markup.
# The result is written back with $self->utf8_write_file.
#
# Parameters:
#   $conv_filename - path of the converted file to rewrite
#
# Returns whatever $self->utf8_write_file returns.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types? will we
    # do this step for them anyway?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);
    $text = "" unless defined $text;

    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page). Metadata based on this calculation is not set
    # until process().
    #
    # Note: this is done even if we are not breaking the document into pages,
    # as it might be useful to give an indication of document length in the
    # browser through setting num_pages as metadata.
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    my $converted_to = defined $self->{'converted_to'} ? $self->{'converted_to'} : "";

    if ($self->{'use_sections'} && $converted_to eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        # Decide this BEFORE removing the header chunk. Testing the count
        # after the shift would mistake a document with exactly one page
        # anchor for "no sections", and rejoining would then lose the
        # "<a name=" delimiter that split() consumed.
        my $found_sections = (scalar(@sections) > 1);

        if (!$found_sections) {
            # only one chunk - no split! Leave $text exactly as it is.
            print $outhandle "PDFPlugin: warning - no sections found\n";
        }
        else {
            # keep HTML header etc as top_section
            my $top_section = shift @sections;

            # Derive a document title from the text of the first page.
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//;       # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g;  # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            # nbsp as an HTML entity, or as its utf-8 byte pair
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...

            # Only abbreviate titles that really are too long; a short title
            # must not lose its last word to the ellipsis.
            if (length($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like
                # "outline". Greenstone does magic if sections are titled
                # digits.
                my $section_title;
                # Strip the leftover from the split expression. $1 is only
                # meaningful if the substitution actually matched; otherwise
                # it still holds the capture of some earlier match.
                if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                    $section_title = $1;
                }
                else {
                    print STDERR "no title: $section\n";
                    $section_title = " ";
                }

                $section = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                    . "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><br />\n" #TODO: . "</Metadata>\n--><p>\n";
                    . $section
                    . "<!--</Section>-->\n";
            }

            $text = join('', ($top_section, @sections));
        }
    }

    if ($self->{'use_sections'} && $converted_to eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
362
363
# Do plugin specific processing of doc_obj for the HTML type.
#
# Delegates to $self->process_type, then on the top section of $doc_obj:
#   - replaces each extracted "date" value with a "Date" value in YYYYMMDD
#     form (values that contain no d-d-d date, or whose year is 0, are
#     simply removed);
#   - adds "NumPages" when $self->{'num_pages'} is defined;
#   - sets "gsdlthistype" when use_sections is on and the conversion was to
#     HTML.
#
# Parameters: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); only
# $base_dir, $file and $doc_obj are used here.
#
# Returns the result of $self->process_type.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Take a snapshot of the values first, so that deleting entries inside
    # the loop cannot disturb the list being iterated over.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};

    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Only read the captures when the match succeeded: after a failed
        # match $1..$3 still hold the captures of an earlier match (possibly
        # one made by our caller).
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        $year += 1900 if $year < 100;             # just to be safe
        $month = "0$month" if $month =~ /^\d$/;   # single digit
        $day = "0$day" if $day =~ /^\d$/;         # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't
        # get it right if any section has an empty title, or one with letters
        # in it
        if (util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly
            # if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
408
409	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFv1Plugin.pm@ 32273

Download in other formats: