Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 10434

Last change on this file since 10434 was 10429, checked in by chi, 19 years ago
Modification of the way passing argument and option lists for the secondary plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.8 KB

Line
1	###########################################################################
2	#
3	# PDFPlug.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlug;
26
27	use ConvertToPlug;
28	use unicode;
29	use strict;
30	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
31
# Set up inheritance at compile time: PDFPlug derives from ConvertToPlug.
BEGIN {
    @PDFPlug::ISA = qw(ConvertToPlug);
}
35
# Descriptions of the arguments this plugin accepts.  Each entry is a hash
# with a 'name', a 'desc' placeholder of the form "{Class.key}" (presumably
# resolved to display text elsewhere - confirm), a 'type', and optionally a
# default ('deft'), a 'range' and a 'reqd' marker.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag",
    },
    {
        'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag",
    },
    {
        'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag",
    },
    {
        'name'  => "zoom",
        'desc'  => "{PDFPlug.zoom}",
        'deft'  => "2",
        'range' => "1,3",    # actually the range is 0.5-3
        'type'  => "int",
    },
    {
        'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag",
    },
];

# Plugin-level option record; 'args' points at the argument list above.
my $options = {
    'name'     => "PDFPlug",
    'desc'     => "{PDFPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
69
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      pattern is appended before it is handed to the parent
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the plugin object blessed into $class, after the options for the
# secondary plugins (HTMLPlug, TEXTPlug and, for paged-image conversion,
# PagedImgPlug) have been assembled and load_secondary_plugins() called.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    # Defined once here so every plugin receives exactly the same pattern.
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Direct method-call syntax; same call as the former "new ConvertToPlug(...)".
    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : ConvertToPlug->new($pluginlist, $inputargs);

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure an option list exists for each secondary plugin we add to.
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg/i) {
        # Only supply defaults when no PagedImgPlug options were given.
        if (!defined $secondary_plugin_options->{'PagedImgPlug'}) {
            $secondary_plugin_options->{'PagedImgPlug'} = ["-title_sub", $title_sub];
        }
    }
    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
        # pdftohtml will always produce html files encoded as utf-8
        # => restrict primary PDFPlug and secondary HTML plugin to use
        # utf8 and extract language.
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;

        push(@$html_options, "-input_encoding", "utf8");
        push(@$html_options, "-extract_language");
    }

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");

    if ($self->{'use_sections'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
139
# Default process expression: a regular expression (as a string) matching
# any filename that ends in ".pdf", case-insensitively.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
145
# Default block expression: deliberately empty, so that this plugin does
# not inherit HTMLPlug's block expression.
sub get_default_block_exp {
    my $no_block_exp = "";
    return $no_block_exp;
}
150
# Post-process the file produced by the conversion step.
#
# Arguments:
#   $conv_filename - name of the converted file; its text is read, possibly
#                    rewritten, and written back under the same name
#
# Effects:
#   - stores the number of <a name=...> anchors found in $self->{'num_pages'}
#   - when 'use_sections' is set and the file was converted to HTML, and
#     more than one page anchor is present, wraps each page in
#     <!--<Section>--> comment markers carrying a Title
#   - passes the text through unicode::ensure_utf8 and writes it back
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        if (scalar (@sections) < 2) { # nothing to split on
            print $outhandle "PDFPlug: warning - no sections found\n";
        }

        # Only rebuild the text when there is a header plus at least two
        # pages.  With a single page (or none) the text is left exactly as
        # read: split() has consumed the "<a name=" delimiter, so pasting
        # the pieces back together would leave stray text in the HTML.
        if (scalar (@sections) > 2) {
            # keep HTML header etc as top_section
            my $top_section = shift @sections;

            # Derive a document title from the start of the first page.
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            # nbsp, either as an HTML entity or as its utf-8 byte sequence
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            # Shorten over-long titles only; cut at the last whitespace so
            # we do not end on a partial word.
            if (length ($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like "outline"
                # Greenstone does magic if sections are titled digits
                my $section_title;
                if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                    $section_title = $1;
                } else {
                    # Take $1 only after a successful match; otherwise it
                    # would still hold a capture from an earlier regex.
                    print STDERR "no title: $section\n";
                    $section_title = " ";
                }
                my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
                $newsection .= "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><p>\n";
                $newsection .= $section;
                $newsection .= "<!--</Section>-->\n";
                $section = $newsection; # aliased: updates @sections in place
            }

            $text=join('', ($top_section, @sections));
        }
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
241
242
# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
# only $base_dir, $file and $doc_obj are used here.
#
# Calls process_type("pdf", ...), then on the top section of $doc_obj:
#   - replaces each "date" value with a "Date" value in YYYYMMDD form
#     (values without a d-d-d pattern, or with year 0, are just removed)
#   - adds "NumPages" from $self->{'num_pages'}
#   - marks the document as "Paged" when sections were requested and the
#     file was converted to HTML
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the list: the loop deletes "date" values from the
    # document while walking over them.
    my @date_values = @{ $doc_obj->get_metadata($cursection, "date") || [] };

    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Use the captures only when this match succeeded; after a failed
        # match $1..$3 would still hold values from an earlier regex.
        next unless ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        my ($year, $month, $day) = ($1,$2,$3);

        next if ($year == 0);
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
283
284	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: