Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 9465

Last change on this file since 9465 was 9465, checked in by kjdon, 19 years ago
ConvertToPlug now accepts the gli arg in read() and passes it in process(), and all the convert plugins now print a processing message for gli in their process() method.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	###########################################################################
2	#
3	# PDFPlug.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	use strict;
26	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
27
28	package PDFPlug;
29
30	use ConvertToPlug;
31	use unicode;
32
# Establish inheritance at compile time: PDFPlug is a ConvertToPlug.
BEGIN {
    @PDFPlug::ISA = qw(ConvertToPlug);
}
36
# Table of the arguments this plugin declares. Each entry is a hash
# with a name, a resource-bundle key for its description, a type and,
# where relevant, a default value and range.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag",
    },
    {
        'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag",
    },
    {
        'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag",
    },
    {
        'name'  => "zoom",
        'desc'  => "{PDFPlug.zoom}",
        'deft'  => "2",
        # The declared range is integral; the real usable range is 0.5-3.
        'range' => "1,3",
        'type'  => "int",
    },
    {
        'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag",
    },
];
64
# Plugin descriptor: names the plugin, points at its description key and
# attaches the argument table above. new() pushes this onto the object's
# 'option_list'.
my $options = {
    'name'     => "PDFPlug",
    'desc'     => "{PDFPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
70
# Constructor.
#
# Parses the PDF-specific options out of the argument list, builds the
# underlying ConvertToPlug object and records the conversion options
# that ConvertToPlug passes through to gsConvert.pl.
#
# On an option-parsing failure a usage message is printed to the
# plugin's outhandle and the program exits with status 1.
sub new {
    my $class = shift @_;

    # Keep our own copy of the arguments for the ConvertToPlug
    # constructor; parsargv::parse() is handed @_ itself.
    my @plugin_args = @_;

    my ($noimages, $complex, $nohidden, $zoom, $use_sections);

    my $parsed_ok = parsargv::parse(\@_,
                                    q^noimages^,     \$noimages,
                                    q^complex^,      \$complex,
                                    q^zoom/\d+/2^,   \$zoom,
                                    q^nohidden^,     \$nohidden,
                                    q^use_sections^, \$use_sections,
                                    "allow_extra_options");

    unless ($parsed_ok) {
        my $self = ConvertToPlug->new($class, @_);
        my $outhandle = $self->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        exit 1;
    }

    push(@plugin_args, "-description_tags") if $use_sections;

    # The title_sub below strips the "Page 1" that pdftohtml adds, and a
    # leading "1", which is often the page number at the top of the
    # page. A document title that really starts with "1 " loses it too.
    my $self = ConvertToPlug->new($class, @plugin_args,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PDFPlug";
    $self->{'use_sections'} = 1 if $use_sections;

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    push(@{ $self->{'option_list'} }, $options);

    # These are passed through to gsConvert.pl by ConvertToPlug.pm.
    my @convert_flags = ("-pdf_zoom $zoom");
    push(@convert_flags, "-pdf_complex")       if $complex;
    push(@convert_flags, "-pdf_nohidden")      if $nohidden;
    push(@convert_flags, "-pdf_ignore_images") if $noimages;
    $self->{'convert_options'} = join(' ', @convert_flags);

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'}   = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
126
# Default regular expression for files this plugin processes: any name
# ending in ".pdf", matched case-insensitively. Arguments are ignored.
sub get_default_process_exp {
    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
132
# Default block expression: the empty string, so that HTMLPlug's block
# expression is not inherited.
sub get_default_block_exp {
    return q{};
}
137
138
# Do the PDF-specific processing of the converted text, then hand the
# document on to ConvertToPlug::process_type().
#
# Arguments after $self are read positionally from @_ so that the whole
# list can be forwarded unchanged:
#   $_[0]  reference to the converted document text (rewritten in place)
#   $_[3]  file name, used only in progress messages
#   $_[5]  document object that receives the metadata
#   $_[6]  true when a "<Processing ...>" line should be printed to STDERR
#
# When 'use_sections' is set and the conversion produced HTML, the text
# is split on the "<a name=...>" anchors into <Section> blocks and the
# document is marked as "Paged".
#
# Afterwards any "date" metadata on the top section is replaced by
# "Date" in YYYYMMDD form, and "NumPages" is added.
#
# Returns whatever ConvertToPlug::process_type() returns.
sub process {
    my $self = shift (@_);
    #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $textref   = $_[0];

    # we have "<a name=1></a>" etc for each page; it may be <A name=
    my $page_anchor_re = qr/\<[Aa] name=\"?\w+\"?>/i;

    # Set only when the text is rewritten into sections below; the
    # rewrite strips the anchors, so they must be counted beforehand.
    my $page_count;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";

        my @sections = split('<[Aa] name=', $$textref);

        # Decide once, before the header is shifted off. Re-testing the
        # count afterwards would mistake "header + one page" for "no
        # split" and leave a stray anchor remnant in the text.
        my $no_split = (scalar (@sections) == 1);

        if ($no_split) {
            print $outhandle "PDFPlug: warning - no sections found\n";
        } else {
            my @anchors = ($$textref =~ /$page_anchor_re/g);
            $page_count = scalar (@anchors);
            shift @sections;  # don't need HTML header, etc
        }

        # The top-level title is taken from the first 100 characters of
        # the first section.
        # NOTE(review): the final substitution replaces the last word
        # with "..." even when nothing was truncated - confirm intended.
        my $title = $sections[0];
        $title = '' unless defined $title;
        $title =~ s/^\"?\w+\"?>//;           # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g;      # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        # nbsp as an HTML entity or as its raw utf-8 bytes -> plain space
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s;                 # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        my $top_section;
        if ($no_split) {
            # leave the text exactly as it was
            $top_section = $sections[0];
            @sections = ();
        } else {
            $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline".
            # Only trust $1 when the substitution actually matched; after
            # a failed match it still holds an earlier capture.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                # Greenstone does magic if sections are titled digits
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $$textref = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8($textref);

    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$_[3]' p='PDFPlug'>\n" if ($_[6]);

    # tell htmlplug to extract these metadata fields from the HEAD META
    # fields. Append only once: this method runs for every document, and
    # appending each time would grow the list without bound.
    my $extra_fields = ",date,author<Creator>";
    $self->{'metadata_fields'} = '' unless defined $self->{'metadata_fields'};
    if (index ($self->{'metadata_fields'}, $extra_fields) < 0) {
        $self->{'metadata_fields'} .= $extra_fields;
    }

    my $result = ConvertToPlug::process_type($self, "pdf", @_);

    my $doc_obj = $_[5];
    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time.
        # Some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Take the captures from the match's own return list so that a
        # failed match yields nothing rather than stale $1..$3.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless defined $day;
        next if $year == 0;

        $year += 1900 if $year < 100;            # just to be safe
        $month = "0$month" if $month =~ /^\d$/;  # single digit
        $day   = "0$day"   if $day   =~ /^\d$/;  # single digit
        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    # Add NumPages metadata (we have "<a name=1>" etc for each page).
    # If the text was not rewritten into sections, count the anchors now.
    if (!defined $page_count) {
        my @pages = ($$textref =~ /$page_anchor_re/g);
        $page_count = scalar (@pages);
    }
    $doc_obj->add_utf8_metadata($cursection, "NumPages", $page_count);

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
255
256	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: