Context Navigation

source: gsdl/trunk/perllib/plugins/PDFPlugin.pm@ 20790

Last change on this file since 20790 was 20790, checked in by kjdon, 15 years ago
Set the -processing_tmp_files option on the secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (the list the incremental build uses to work out what needs reimporting).
Property svn:keywords set to `Author Date Id Revision`
File size: 12.6 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use ConvertBinaryFile;
28	use ReadTextFile;
29	use unicode;
30	use strict;
31	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
32
# Set up inheritance at compile time: ConvertBinaryFile is searched first,
# then ReadTextFile.
BEGIN {
    @PDFPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
36
# Values accepted by the -convert_to option. Each 'desc' is a brace-wrapped
# key string, not literal help text.
my $convert_to_list = [
    { 'name' => 'auto',         'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'html',         'desc' => '{ConvertBinaryFile.convert_to.html}' },
    { 'name' => 'text',         'desc' => '{ConvertBinaryFile.convert_to.text}' },
    { 'name' => 'pagedimg_jpg', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { 'name' => 'pagedimg_gif', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { 'name' => 'pagedimg_png', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];


# Plugin-specific argument descriptions. new() appends this list to the
# shared "ArgList" it is handed.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];

# Plugin-level option record. new() appends it to the shared "OptList".
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    'srcreplaceable' => 'yes',    # Source docs in PDF can be replaced with GS-generated html
    'args'           => $arguments,
};
105
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of command-line style arguments; a default
#                      -title_sub pattern is appended before the parent
#                      constructor sees it
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument and option descriptions
#
# Returns the object built by ConvertBinaryFile->new, re-blessed into $class.
# When the parent sets 'info_only', the object is returned immediately and no
# conversion or secondary-plugin options are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Strips "Page 1" (added by pdftohtml) and a leading "1", which is often
    # the page number at the top of the page. Bad luck if your document title
    # actually starts with "1 " - is there a better way?
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax rather than "new ConvertBinaryFile(...)": this package
    # defines its own new(), so the indirect-object form is ambiguous.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    # PagedImagePlugin options are only set up for the pagedimg_* targets,
    # and only when none were supplied already. The alternation must be an
    # unescaped "|" so that either spelling matches.
    if (defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ m/(?:pagedimage|pagedimg)/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [
                "-title_sub",      $title_sub_re,
                "-screenviewsize", "1000",
                "-enable_cache",
            ];
        }
    }

    my $html_options     = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options     = $secondary_plugin_options->{'TextPlugin'};
    # Stays undef unless PagedImagePlugin options exist; every push onto it
    # below is guarded so that undef is never autovivified into a stray array.
    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};

    push(@$html_options, "-processing_tmp_files");
    push(@$pagedimg_options, "-processing_tmp_files") if defined $pagedimg_options;

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options, "-metadata_fields", $self->{'metadata_fields'});
    } else {
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    # use_sections implies description_tags
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub_re);
    push(@$text_options, "-title_sub", $title_sub_re);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re) if defined $pagedimg_options;
    }

    push(@$html_options, "-file_rename_method", "none");
    push(@$text_options, "-file_rename_method", "none");
    push(@$pagedimg_options, "-file_rename_method", "none") if defined $pagedimg_options;

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
204
# Default -process_exp: a case-insensitive match on a trailing ".pdf".
sub get_default_process_exp {
    my $self = shift;

    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
210
# Default -block_exp is the empty string, so that HTMLPlug's block
# expression is not inherited.
sub get_default_block_exp {
    my $no_block_exp = '';
    return $no_block_exp;
}
215
# Build the top-level section title from the HTML of the first page.
#
#   $html      - text that followed the first "<a name=" split point
#   $title_sub - optional regex source removed from the start of the title
#
# Returns plain text with tags removed and whitespace collapsed. Titles
# longer than 100 characters are cut there and the trailing partial word is
# replaced by "..."; shorter titles are returned whole.
sub _pdf_top_section_title {
    my ($html, $title_sub) = @_;

    my $title = defined $html ? $html : "";
    $title =~ s/^\"?\w+\"?>//;                # specific for pdftohtml...
    $title =~ s/<\/([^>]+)><\1>//g;           # (eg) </b><b> - no space
    $title =~ s/<[^>]*>/ /g;
    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;      # nbsp entity, or its utf-8 bytes
    $title =~ s/^\s+//s;
    $title =~ s/\s+$//;
    $title =~ s/\s+/ /gs;
    $title =~ s/^$title_sub// if ($title_sub);
    $title =~ s/^\s+//s;                      # in case title_sub introduced any...

    if (length($title) > 100) {
        $title = substr($title, 0, 100);
        $title =~ s/\s\S*$/.../;
    }
    return $title;
}

# Post-process a converted file in place.
#
#   $conv_filename - path of the converted file; it is read via read_file()
#                    and rewritten via utf8_write_file()
#
# Always records the number of page anchors in $self->{'num_pages'}.
# When 'use_sections' is set and 'converted_to' is "HTML", the text is split
# on "<a name=" anchors and each piece is wrapped in <Section> comment markup
# carrying a Title metadata element.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Count pages from the <a name=...> anchors (there is one per page, eg
    # <a name=1>). Done even when not sectioning, since the count is stored
    # as metadata later in process().
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && defined $self->{'converted_to'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar(@sections) <= 1) {
            # No anchor at all (or empty text): keep the text untouched.
            print $outhandle "PDFPlugin: warning - no sections found\n";
            $top_section .= $sections[0] if @sections;
            @sections = ();
        } else {
            # keep HTML header etc as top_section
            $top_section .= shift @sections;

            # A single remaining piece is still a real section: the split has
            # already removed its "<a name=" prefix, so appending it raw to
            # the header would corrupt the markup.
            my $title = _pdf_top_section_title($sections[0], $self->{'title_sub'});
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits.
            # Only trust $1 when the substitution matched; otherwise it still
            # holds the capture from an earlier match.
            my $title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $title = $1;
            } else {
                print STDERR "no title: $section\n";
                $title = " ";
            }

            $section = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $title
                . "</Metadata>\n--><p>\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
308
309
# do plugin specific processing of doc_obj for HTML type
#
# Delegates to process_type(), then on the document's top section:
#   - replaces every "date" value with a "Date" value in YYYYMMDD form
#     (values with no Y-M-D pattern, or with year 0, are dropped),
#   - adds "NumPages" from $self->{'num_pages'},
#   - marks the document "Paged" when sectioned HTML was produced.
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Iterate over a copy: values are deleted inside the loop, and the
    # returned array ref may be the document's own storage.
    my $dates_ref = $doc_obj->get_metadata($cursection, "date");
    my @raw_dates = defined $dates_ref ? @$dates_ref : ();

    foreach my $datemeta (@raw_dates) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Take the captures from the match's own return list, so a failed
        # match yields undef instead of the previous iteration's $1..$3.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless defined($year) && defined($month) && defined($day);
        next if $year == 0;

        if ($year < 100) { $year += 1900 }           # just to be safe
        if ($month =~ /^\d$/) { $month = "0$month" } # single digit
        if ($day =~ /^\d$/) { $day = "0$day" }       # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'}
        && defined $self->{'converted_to'}
        && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
349
350	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: