Context Navigation

source: gsdl/trunk/perllib/plugins/PDFPlugin.pm@ 18406

Last change on this file since 18406 was 18406, checked in by ak19, 15 years ago
Modified srcreplaceable plugins (plugins which operate on docs where the source file can be replaced with their converted htmls) to set the file_rename_method to none for secondary plugins (for Text, HTML, and PagedImage plugins) so that the file is not renamed several times.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.5 KB

Line
###########################################################################
#
# PDFPlugin.pm -- reasonably with-it pdf plugin
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package PDFPlugin;

# Base classes (see @PDFPlugin::ISA) and the unicode helper used when
# post-processing converted files.
use ConvertBinaryFile;
use ReadTextFile;
use unicode;
use strict;
no strict 'refs'; # so we can use a var for filehandles (eg STDERR)

# Set up inheritance at compile time.  ConvertBinaryFile is listed first, so
# it is searched before ReadTextFile during method resolution.
BEGIN {
    @PDFPlugin::ISA = ('ConvertBinaryFile', 'ReadTextFile');
}

# Allowed values for the "convert_to" plugin argument.  Each 'desc' is a
# "{Plugin.key}" placeholder string, reproduced exactly.
my $convert_to_list = [
    { 'name' => "auto",
      'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "html",
      'desc' => "{ConvertBinaryFile.convert_to.html}" },
    { 'name' => "text",
      'desc' => "{ConvertBinaryFile.convert_to.text}" },
    { 'name' => "pagedimg_jpg",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}" },
    { 'name' => "pagedimg_gif",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}" },
    { 'name' => "pagedimg_png",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}" },
];


# Argument descriptions for this plugin.  new() appends this list to the
# caller's "ArgList".  The two regexp defaults come from the
# get_default_*_exp() subs defined further down in this file.
my $arguments = [
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $convert_to_list,
      'deft' => "html" },
    { 'name' => "process_exp",
      'desc' => "{BasePlugin.process_exp}",
      'type' => "regexp",
      'deft' => get_default_process_exp(),
      'reqd' => "no" },
    { 'name' => "block_exp",
      'desc' => "{BasePlugin.block_exp}",
      'type' => "regexp",
      'deft' => get_default_block_exp() },
    { 'name' => "metadata_fields",
      'desc' => "{HTMLPlugin.metadata_fields}",
      'type' => "string",
      'deft' => "" },
    { 'name' => "noimages",
      'desc' => "{PDFPlugin.noimages}",
      'type' => "flag" },
    { 'name' => "allowimagesonly",
      'desc' => "{PDFPlugin.allowimagesonly}",
      'type' => "flag" },
    { 'name' => "complex",
      'desc' => "{PDFPlugin.complex}",
      'type' => "flag" },
    { 'name' => "nohidden",
      'desc' => "{PDFPlugin.nohidden}",
      'type' => "flag" },
    { 'name' => "zoom",
      'desc' => "{PDFPlugin.zoom}",
      'deft' => "2",
      'range' => "1,3", # actually the range is 0.5-3
      'type' => "int" },
    { 'name' => "use_sections",
      'desc' => "{PDFPlugin.use_sections}",
      'type' => "flag" },
    { 'name' => "description_tags",
      'desc' => "{HTMLPlugin.description_tags}",
      'type' => "flag" },
];

# Plugin description record.  new() appends it to the caller's "OptList".
my $options = {
    'name'           => "PDFPlugin",
    'desc'           => "{PDFPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    'srcreplaceable' => "yes", # Source docs in PDF can be replaced with GS-generated html
    'args'           => $arguments,
};

# Constructor.
#
# Called as PDFPlugin->new($pluginlist, $inputargs, $hashArgOptLists):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      is appended before it is handed to the base class
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList"/"OptList"
#
# Returns the object created by ConvertBinaryFile->new, re-blessed into
# $class.  Unless the object is flagged 'info_only', the conversion options
# and the option lists for the secondary HTML/Text/PagedImage plugins are
# filled in and the secondary plugins are loaded.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call: the indirect form "new ConvertBinaryFile(...)" is
    # ambiguous to the parser because this package defines its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }
    # PagedImagePlugin options are only created for the paged-image
    # conversion targets, and only if the caller has not supplied any.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(?:pagedimage|pagedimg)/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [
                "-title_sub", $title_sub,
                "-screenviewsize", "1000",
            ];
        }
    }
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # stays undef unless PagedImagePlugin options exist
    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options, "-metadata_fields", $self->{'metadata_fields'});
    } else {
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    # use_sections implies description_tags
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options;
    }

    # Secondary plugins must not rename the file again.
    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options;

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}

# Default regular expression (as a string) selecting the files this plugin
# processes: names ending in ".pdf", matched case-insensitively.
# Takes no meaningful arguments; any invocant passed in is ignored.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}

# Default block expression: the empty string, i.e. nothing is blocked.
# Defined here so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return "";
}

# Post-process a converted file in place.
#
# Argument: $conv_filename - name of the converted file; it is read through
# $self->read_file and rewritten through $self->utf8_write_file.
#
# Always records the number of "<a name=...>" anchors in
# $self->{'num_pages'}.  When 'use_sections' is set and the file was
# converted to HTML, the text is split at those anchors and each piece is
# wrapped in <!--<Section>--> description comments carrying a Title.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types? will we
    # do this step for them anyway?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { # only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } elsif (@sections) {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # Build the top-level title from the text of the first section.
        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...

        # Only shorten titles that are actually too long; replace the word
        # cut at the 100 limit with an ellipsis.
        if (length($title) > 100) {
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }

        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections = ();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits.
            # Test the substitution itself: $1 keeps a stale value from an
            # earlier match when this one fails.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            # $section aliases the array element, so this rewrites @sections
            $section = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}


# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); only
# $base_dir, $file and $doc_obj are used here.
# Returns whatever $self->process_type($base_dir, $file, $doc_obj) returned.
#
# On the top section of $doc_obj this also:
#   - replaces each "date" value with a "Date" value in YYYYMMDD form
#     (values that do not look like a date, or whose year is 0, are dropped)
#   - adds "NumPages" from $self->{'num_pages'}
#   - sets "gsdlthistype" to "Paged" when sections were generated from HTML
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Take the captures from the match's return list: after a failed
        # match $1..$3 would still hold values from an earlier match.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless defined($year) && defined($month) && defined($day);

        next if $year == 0;
        $year += 1900 if $year < 100;            # just to be safe
        $month = "0$month" if $month =~ /^\d$/;  # single digit
        $day = "0$day" if $day =~ /^\d$/;        # single digit
        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}

1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: