Context Navigation

source: gsdl/trunk/perllib/plugins/PDFPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 11.9 KB

Line
1	###########################################################################
2	#
3	# PDFPlugin.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlugin;
26
27	use ConvertBinaryFile;
28	use ReadTextFile;
29	use unicode;
30	use strict;
31	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
32
# Set up the inheritance chain at compile time: ConvertBinaryFile is
# searched first for methods, then ReadTextFile.
sub BEGIN {
    @PDFPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
36
# Allowed values for the "convert_to" argument. Each entry pairs the value
# name with a description string of the form
# "{ConvertBinaryFile.convert_to.<name>}".
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
52
# Argument descriptions for this plugin. Each entry is a hash with a 'name',
# a 'desc' string, a 'type', and optionally 'deft' (default), 'reqd', 'list'
# (for enums) and 'range'. This list is appended to the shared "ArgList" in
# new().
my $arguments = [
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'deft' => "html",
      'reqd' => "yes",
      'list' => $convert_to_list },

    { 'name' => "process_exp",
      'desc' => "{BasePlugin.process_exp}",
      'type' => "regexp",
      'deft' => get_default_process_exp(),
      'reqd' => "no" },

    { 'name' => "block_exp",
      'desc' => "{BasePlugin.block_exp}",
      'type' => "regexp",
      'deft' => get_default_block_exp() },

    { 'name' => "metadata_fields",
      'desc' => "{HTMLPlugin.metadata_fields}",
      'type' => "string",
      'deft' => "" },

    { 'name' => "noimages",
      'desc' => "{PDFPlugin.noimages}",
      'type' => "flag" },

    { 'name' => "allowimagesonly",
      'desc' => "{PDFPlugin.allowimagesonly}",
      'type' => "flag" },

    { 'name' => "complex",
      'desc' => "{PDFPlugin.complex}",
      'type' => "flag" },

    { 'name' => "nohidden",
      'desc' => "{PDFPlugin.nohidden}",
      'type' => "flag" },

    { 'name'  => "zoom",
      'desc'  => "{PDFPlugin.zoom}",
      'type'  => "int",
      'deft'  => "2",
      'range' => "1,3" }, # actually the range is 0.5-3

    { 'name' => "use_sections",
      'desc' => "{PDFPlugin.use_sections}",
      'type' => "flag" },

    { 'name' => "description_tags",
      'desc' => "{HTMLPlugin.description_tags}",
      'type' => "flag" },
];
98
# Plugin-level option record; appended to the shared "OptList" in new().
my $options = {
    'name'           => "PDFPlugin",
    'desc'           => "{PDFPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
105
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      pattern is appended before the base constructor sees it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the plugin object blessed into $class. If the base constructor sets
# 'info_only', the object is returned immediately with no further setup;
# otherwise conversion options and secondary plugin options are prepared and
# the secondary plugins are loaded.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This pattern removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }
    # PagedImagePlugin options are only created for the pagedimg_* targets.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [ "-title_sub", $title_sub ];
        }
    }
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef when not converting to paged images.
    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};

    if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
    }

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options, "-metadata_fields", $self->{'metadata_fields'});
    } else {
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    # use_sections implies description_tags
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        # Without this guard the push would autovivify a throwaway array
        # that is never stored in $secondary_plugin_options.
        push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re)
            if defined $pagedimg_options;
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
195
# Returns the default process_exp: a case-insensitive pattern matching
# file names that end in ".pdf". Takes no arguments of interest.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
201
# Returns an empty default block_exp, so we don't inherit HTMLPlug's
# block exp...
sub get_default_block_exp {
    return q{};
}
206
# Post-processes the converted file named by $conv_filename, in place.
#
# Reads the file, counts the <a name=...> anchors and stores the count in
# $self->{'num_pages'}. When 'use_sections' is set and the document was
# converted to HTML, the text is split at each anchor and every piece is
# wrapped in <!-- <Section> --> comment markers with a Title metadata
# entry. The text is then passed through unicode::ensure_utf8 and written
# back to the same file.
#
# Arguments:
#   $conv_filename - path of the converted file to rewrite
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # handle first section specially for title? Or all use first 100...

        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        # nbsp, either as the HTML entity or as its utf-8 byte sequence
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections = ();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits
            my $section_title;
            # Take $1 only when the substitution matched; otherwise it could
            # hold a stale capture from an earlier successful match.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the array element
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
297
298
# do plugin specific processing of doc_obj for HTML type
#
# Calls process_type("pdf", ...) and then adjusts metadata on the top
# section of $doc_obj:
#   - each "date" value is deleted and, when it contains a y-m-d triple
#     with a non-zero year, re-added as "Date" in yyyymmdd form;
#   - "NumPages" is added from $self->{'num_pages'} when that is defined;
#   - "gsdlthistype" is set to "Paged" when 'use_sections' is on and the
#     document was converted to HTML.
#
# Returns whatever process_type returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type("pdf", $base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Skip values that do not match, rather than reading capture
        # variables left over from an earlier successful match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        $year += 1900 if $year < 100;            # just to be safe
        $month = "0$month" if $month =~ /^\d$/;  # single digit
        $day = "0$day" if $day =~ /^\d$/;        # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    # num_pages is set by convert_post_process; do not add an undefined value
    if (defined $self->{'num_pages'}) {
        $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});
    }

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
338
339	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: