Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 22709

Last change on this file since 22709 was 22705, checked in by davidb, 14 years ago
Use of AutoloadConverterScripting expanded to encompass PowerPoint and Excel. OOConvertBinaryFile and PBConvertBinaryFile are no longer needed.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.3 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use strict;
28	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29
30	use ReadTextFile;
31	use unicode;
32
33	use AutoloadConverterScripting;
34
35	@PDFPlugin::ISA = ('AutoloadConverterScripting', 'ReadTextFile');
36
37
# Values accepted by the "convert_to" option.  Every entry is a hash with the
# option value under 'name' and the matching
# "{ConvertBinaryFile.convert_to.<name>}" string under 'desc'.
my $convert_to_list = [
    map {
        +{  'name' => $_,
            'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}',
        }
    } qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
52
53
# Option table for this plugin.  new() appends these entries to the
# "ArgList" of the $hashArgOptLists structure it is given.
my $arguments = [
    # --- conversion target ------------------------------------------------
    {   'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },

    # --- which files to process / block -----------------------------------
    {   'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {   'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },

    # --- metadata extraction (forwarded to the HTML secondary plugin) -----
    {   'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {   'name' => 'metadata_field_separator',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
        'type' => 'string',
        'deft' => '',
    },

    # --- flags turned into converter options by new() ---------------------
    {   'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {   'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {   'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {   'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {   'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },

    # --- sectioning ---------------------------------------------------------
    {   'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {   'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
103
# Plugin description record.  new() pushes it onto the "OptList" of the
# $hashArgOptLists structure it is given.
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
110
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of option strings; a default "-title_sub"
#                      option is appended before the base constructor runs
#   $hashArgOptLists - hash ref; this plugin's $arguments are appended to its
#                      "ArgList" and $options to its "OptList"
#
# Returns the plugin object blessed into $class.  When the base constructor
# reports 'info_only', the object is returned immediately without any of the
# conversion or secondary-plugin set-up below.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = AutoloadConverterScripting->new("PDFBoxConverter", $pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for the chosen secondary plugin;
    # everything pushed onto $specific_options below lands in that list.
    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options, "-metadata_fields", $self->{'metadata_fields'});
        } else {
            push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags: convert_post_process()
        # writes the section markup as HTML comments.
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
200
# Default for the "process_exp" option: a case-insensitive pattern matching
# file names that end in ".pdf".  Any arguments are ignored, so it can be
# called either as a method or as a plain function.
sub get_default_process_exp {
    my $pdf_suffix_pattern = '(?i)\.pdf$';
    return $pdf_suffix_pattern;
}
206
# Default for the "block_exp" option.  It is deliberately the empty string
# so that HTMLPlug's block expression is not inherited.
sub get_default_block_exp {
    return q{};
}
211
# Post-process the converted file in place.
#
# Arguments:
#   $conv_filename - name of the converted file; it is read, possibly
#                    rewritten with section markup, and written back.
#
# Effects:
#   * $self->{'num_pages'} is set to the number of <a name=...> anchors found
#     (one per page in pdftohtml output).  This is done even when the
#     document is not split into sections, so that the count is available as
#     metadata in process().
#   * When 'use_sections' is set and the file was converted to HTML, each
#     anchor-delimited chunk is wrapped in <!-- <Section> ... --> comments
#     carrying a Title, and the text before the first anchor is kept as the
#     top section.
#   * The text is passed through unicode::ensure_utf8() and written back
#     with utf8_write_file().
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # we have a <a name=1> etc for each page
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        # Decide once whether the split found any anchors.  Re-testing the
        # element count after the header has been shifted off would mistake
        # a document with exactly one anchor for one with none.
        my $found_sections = (scalar(@sections) > 1);
        if ($found_sections) {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        } else {
            print $outhandle "PDFPlugin: warning - no sections found\n";
        }

        # Derive the top-level title from the first chunk: strip the anchor
        # remainder and all tags, normalise whitespace, apply title_sub and
        # cut to roughly 100 characters.
        my $title = @sections ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # entity, or utf-8 bytes, for nbsp
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if ($found_sections) {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        } else {
            # no anchors: the whole text is the top section
            $top_section .= $sections[0] if @sections;
            @sections = ();
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline".
            # Only read $1 when the substitution succeeded; after a failed
            # match it would still hold an earlier capture.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1; # Greenstone does magic if sections are titled digits
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the array element
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
304
305
# Do plugin specific processing of doc_obj for HTML type.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); only
# $base_dir, $file and $doc_obj are used here.
#
# Steps:
#   1. delegate to process_type() and remember its result;
#   2. replace every "date" metadata value on the top section with a "Date"
#      value in YYYYMMDD form (values that cannot be parsed, or whose year is
#      0, are dropped);
#   3. add "NumPages" from $self->{'num_pages'};
#   4. when sections were generated from HTML, mark the document as "Paged".
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Iterate over a copy: entries are deleted inside the loop, and it is not
    # visible here whether get_metadata() hands back its internal list.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Only use the captures when the match succeeded; after a failed
        # match $1..$3 may still hold values from an earlier match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month = "0$month"} # single digit
        if ($day =~ /^\d$/) {$day = "0$day"} # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
345
346	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: