Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 22861

Last change on this file since 22861 was 22861, checked in by kjdon, 14 years ago
now uses new AutoLoadConverters instead of AutoloadConverterScripting. This doesn't inherit from ConvertBinaryFile, so these plugins all inherit from that again. Now we can initialise the converters, fix up the modifications to the arguments, before parsing them when we do new ConvertBinaryFile. PowerPointPlugin incomplete and still needs lots of work done for processing the result on open office conversion
Property svn:keywords set to `Author Date Id Revision`
File size: 13.1 KB

Line
###########################################################################
#
# PDFPlugin.pm -- reasonably with-it pdf plugin
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package PDFPlugin;

use strict;
no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)

use ReadTextFile;
use unicode;

use AutoLoadConverters;

# Parent classes, searched in this order for inherited methods.
# NOTE(review): ConvertBinaryFile is listed here and instantiated in new(),
# but this file never loads it explicitly -- presumably one of the modules
# above (or the plugin loader) pulls it in; confirm.
@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');


# Values accepted by the -convert_to option (see $arguments below).
my $convert_to_list =
    [ { 'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}" },
      { 'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}" },
      { 'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}" },
      { 'name' => "pagedimg_jpg",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
      { 'name' => "pagedimg_gif",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}"},
      { 'name' => "pagedimg_png",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}"},
    ];


# Option descriptions for this plugin. new() appends them to the shared
# "ArgList" before the parent constructors run. The 'desc' values look like
# keys into a message catalogue rather than display text -- TODO confirm.
my $arguments =
    [
      { 'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "html" },
      { 'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp() },
      { 'name' => "metadata_fields",
        'desc' => "{HTMLPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "noimages",
        'desc' => "{PDFPlugin.noimages}",
        'type' => "flag" },
      { 'name' => "allowimagesonly",
        'desc' => "{PDFPlugin.allowimagesonly}",
        'type' => "flag" },
      { 'name' => "complex",
        'desc' => "{PDFPlugin.complex}",
        'type' => "flag" },
      { 'name' => "nohidden",
        'desc' => "{PDFPlugin.nohidden}",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "{PDFPlugin.zoom}",
        'deft' => "2",
        'range' => "1,3", # actually the range is 0.5-3
        'type' => "int" },
      { 'name' => "use_sections",
        'desc' => "{PDFPlugin.use_sections}",
        'type' => "flag" },
      { 'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag" }
    ];

# Plugin description record. new() appends it to the shared "OptList".
my $options = { 'name' => "PDFPlugin",
                'desc' => "{PDFPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'srcreplaceable' => "yes", # Source docs in PDF can be replaced with GS-generated html
                'args' => $arguments };

# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin options; a default -title_sub is
#                      appended before the parent constructors see it
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the blessed plugin object. When the merged object has 'info_only'
# set it is returned immediately, without any of the conversion set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    # Presumably parsed by the parent constructor into $self->{'title_sub'},
    # which convert_post_process reads -- TODO confirm.
    push(@$inputargs,"-title_sub");
    push(@$inputargs,$title_sub);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Arrow-style constructor calls: the indirect form "new Class(...)" is
    # ambiguous inside a package that defines its own new().
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to (an unset GSDLOS is treated as "not Windows")
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    # Options handed to the secondary plugin that processes the converted file.
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options , "-title_sub", $title_sub);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags: convert_post_process writes
        # its section markers as <!--<Section>...--> comments.
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}

# Returns the default process_exp for this plugin as a regular-expression
# string: it matches names ending in ".pdf", case-insensitively.
# Takes no meaningful arguments (it is also called as a plain function when
# the option table is built).
sub get_default_process_exp {
    return q^(?i)\.pdf$^;
}

# so we don't inherit HTMLPlug's block exp...
# Returns the default block_exp: the empty string.
sub get_default_block_exp {
    return "";
}

# Runs the inherited init (found through @ISA, whose first entry is
# ConvertBinaryFile) with the caller's arguments, then AutoLoadConverters'
# init with no extra arguments. The value of the latter call is returned.
sub init {
    my $self = shift (@_);

    # ConvertBinaryFile init
    $self->SUPER::init(@_);
    $self->AutoLoadConverters::init();
}

# Runs AutoLoadConverters' begin with no extra arguments, then the inherited
# begin (found through @ISA) with the caller's arguments. The value of the
# inherited call is returned.
sub begin {
    my $self = shift (@_);

    $self->AutoLoadConverters::begin();
    $self->SUPER::begin(@_);
}

# Runs AutoLoadConverters' deinit with no extra arguments, then the inherited
# deinit (found through @ISA) with the caller's arguments. The value of the
# inherited call is returned.
sub deinit {
    my $self = shift (@_);

    $self->AutoLoadConverters::deinit();
    $self->SUPER::deinit(@_);
}


# Forces the AutoLoadConverters implementation of tmp_area_convert_file,
# bypassing normal @ISA lookup (which would try ConvertBinaryFile first).
# All arguments are passed through and the result is returned unchanged.
sub tmp_area_convert_file {
    my $self = shift (@_);

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}

# Post-processes the converted file in place.
#
#   $conv_filename - path of the converted file; it is read through
#                    $self->read_file and written back with
#                    $self->utf8_write_file
#
# Always records the number of <a name=...> page anchors in
# $self->{'num_pages'}. When 'use_sections' is set and 'converted_to' is
# "HTML", the text is additionally cut at each page anchor and every page is
# wrapped in <!--<Section>--> markers carrying a Title.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && defined $self->{'converted_to'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        }

        # Only restructure when the header is followed by at least two pages.
        # With fewer pieces $text is left exactly as read: rebuilding it from
        # the split pieces would drop the "<a name=" text that split() consumed.
        if (scalar (@sections) > 2) {
            # keep HTML header etc as top_section
            my $top_section = shift @sections;

            # the first page supplies the document-level title
            my $title = $self->_title_from_first_page($sections[0]);
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like "outline"
                # Greenstone does magic if sections are titled digits
                my $section_title;
                # Use the substitution's own result: after a failed match $1
                # still holds the capture of an earlier successful match.
                if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                    $section_title = $1;
                } else {
                    print STDERR "no title: $section\n";
                    $section_title = " ";
                }
                my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
                $newsection .= "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><p>\n";
                $newsection .= $section;
                $newsection .= "<!--</Section>-->\n";
                $section = $newsection;
            }

            $text=join('', ($top_section, @sections));
        }
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}

# Derives a plain-text title from the HTML of the first page: removes the
# anchor remainder and all tags, normalises whitespace, applies
# $self->{'title_sub'} (if set) and limits the result to about 100 characters.
sub _title_from_first_page {
    my $self = shift (@_);
    my ($page_html) = @_;

    my $title = defined $page_html ? $page_html : "";
    $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
    $title =~ s/<[^>]*>/ /g;
    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
    $title =~ s/^\s+//s;
    $title =~ s/\s+$//;
    $title =~ s/\s+/ /gs;
    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
    $title =~ s/^\s+//s; # in case title_sub introduced any...

    # Shorten only over-long titles; a title that already fits must not lose
    # its last word to the ellipsis.
    if (length($title) > 100) {
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;
    }
    return $title;
}


# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); only
# $base_dir, $file and $doc_obj are used here.
#
# Delegates to $self->process_type, then on the document's top section:
#   - replaces each "date" value with a "Date" value in YYYYMMDD form
#     (values without a d-d-d pattern, or with year 0, are just removed)
#   - adds "NumPages" from $self->{'num_pages'}
#   - marks the document as "Paged" when sectioned HTML was produced
# Returns the result of process_type.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    # Work on a copy of the list: entries are deleted from the document while
    # we loop over them.
    my $date_values = $doc_obj->get_metadata($cursection, "date");
    my @date_values = @{ $date_values || [] };
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Take the parts from the match's return list: $1..$3 would keep the
        # values of an earlier successful match if this one failed.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless (defined($year) && defined($month) && defined($day));
        next if ($year == 0);

        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}

1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: