Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 22702

Last change on this file since 22702 was 22702, checked in by davidb, 14 years ago
Introduction of new plugin AutoloadConverterScripting to replace duplicated code that turns up in plugins like OOConvertBinaryFile and PBConvertBinaryFile
Property svn:keywords set to `Author Date Id Revision`
File size: 12.4 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use ReadTextFile;
28	use unicode;
29	use strict;
30	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
31
# Compile-time setup for the plugin's inheritance.
#
# The global $::autoloadConverterScripting is set to "PDFBoxConverter"
# before AutoloadConverterScripting is loaded (presumably that module reads
# it while loading - confirm against AutoloadConverterScripting.pm).
# PDFPlugin then inherits from AutoloadConverterScripting first and
# ReadTextFile second.
BEGIN {
    $::autoloadConverterScripting = "PDFBoxConverter";
    require AutoloadConverterScripting;

    @PDFPlugin::ISA = qw(AutoloadConverterScripting ReadTextFile);
}
38
# Allowed values for the "convert_to" argument. Every entry pairs a value
# name with the resource key "{ConvertBinaryFile.convert_to.<name>}" that
# is used as its description.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
54
# Argument descriptions contributed by this plugin. Each entry is a hash
# with a 'name', a 'desc' resource key, a 'type', and optionally a default
# ('deft'), a 'reqd' marker, a 'range' or an enumeration 'list'.
my $arguments = [
    {   'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {   'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {   'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {   'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {   'name' => 'metadata_field_separator',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
        'type' => 'string',
        'deft' => '',
    },
    {   'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {   'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {   'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {   'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {   'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {   'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {   'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
104
# Plugin-level option description; new() appends it to the caller's
# "OptList".
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
111
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      pattern is appended before the parent constructor runs
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the new object blessed into $class. When the parent constructor
# sets 'info_only' the object is returned straight away, without any of
# the conversion or secondary-plugin set-up below.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = AutoloadConverterScripting->new("PDFBoxConverter", $pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # (GSDLOS may be missing from the environment, so test it for
    # definedness before matching against it)
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # the secondary plugin gets the same title_sub as this plugin
    push(@$specific_options, "-title_sub", $title_sub);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
201
# Returns the default process_exp: a pattern string that matches file names
# ending in ".pdf", case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;    # accepted for method-style calls; not used

    return '(?i)\.pdf$';
}
207
# Returns an empty default block_exp,
# so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return q{};
}
212
# Post-processes a converted file in place.
#
# Arguments:
#   $conv_filename - name of the converted file; it is read as utf8,
#                    possibly rewritten, and written back to the same name
#
# Side effects:
#   * $self->{'num_pages'} is set to the number of <a name=...> anchors
#     found in the text.
#   * When 'use_sections' is set and 'converted_to' is "HTML", the text is
#     split at each <a name=...> anchor and every piece is wrapped in
#     <!--<Section>--> markup carrying the anchor name as its Title.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        # Decide once whether the split found anything. Re-testing the
        # array size after the header has been shifted off would mistake a
        # document with exactly one anchor for one with none.
        my $found_sections = (scalar (@sections) > 1);

        if ($found_sections) {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        } else {
            print $outhandle "PDFPlugin: warning - no sections found\n";
        }

        # handle first section specially for title? Or all use first 100...

        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        # Only shorten (and mark with "...") titles that are actually too
        # long; a short title keeps its last word.
        if (length ($title) > 100) {
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }

        if ($found_sections) {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        } else {
            # no sections found - keep the text exactly as it was read
            $top_section .= $sections[0] if (scalar (@sections));
            @sections=();
        }

        # add metadata per section...
        foreach my $section (@sections) {
            my $section_title;

            # section names are not always just digits, may be like "outline"
            # Strip the leftover from the split expression. $1 is only
            # read when the substitution really matched, so a stale
            # capture from an earlier match can never become the title.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                $section_title = $1; # Greenstone does magic if sections are titled digits
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection;
        }

        $text=join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
305
306
# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
# Only $base_dir, $file and $doc_obj are used here.
#
# Returns whatever $self->process_type($base_dir, $file, $doc_obj) returns.
#
# On the top section of $doc_obj this also:
#   * replaces every "date" value with a "Date" value in YYYYMMDD form
#     (values that hold no d-d-d date, or whose year is 0, are dropped),
#   * adds "NumPages" from $self->{'num_pages'},
#   * sets "gsdlthistype" to "Paged" when sections are in use for HTML.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the values: entries are deleted from the document
    # while we iterate.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # The captures are only read when the match succeeded.
        if ($datemeta =~ /(\d+)-(\d+)-(\d+)/) {
            my ($year, $month, $day) = ($1,$2,$3);
            if ($year == 0) {next}
            if ($year < 100) {$year += 1900} # just to be safe
            if ($month =~ /^\d$/) {$month="0$month"} # single digit
            if ($day =~ /^\d$/) {$day="0$day"} # single digit
            my $date="$year$month$day";
            $doc_obj->add_utf8_metadata($cursection, "Date", $date);
        }
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
346
347	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: