Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 8795

Last change on this file since 8795 was 8795, checked in by kjdon, 19 years ago
If use_sections is on, we are now a bit more relaxed about what the sections are split on. It used to be <a name=; now it is <[Aa] name=. Instead of matching \d+ for the title, we use \"?\w+\"?, because sometimes you get <a name="outline">. And since we can now have non-digit titles in the document, we explicitly set gsdlthistype to be paged; otherwise Greenstone will treat it as hierarchical if there are non-digit titles.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Line
1	###########################################################################
2	#
3	# PDFPlug.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	use strict;
26	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
27
28	package PDFPlug;
29
30	use ConvertToPlug;
31	use unicode;
32
# Set up inheritance at compile time: PDFPlug derives from ConvertToPlug.
BEGIN {
    @PDFPlug::ISA = qw(ConvertToPlug);
}
36
# Option descriptions for this plugin. The 'desc' values are resource-bundle
# keys in curly braces, not literal help text.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag",
    },
    {
        'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag",
    },
    {
        'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag",
    },
    {
        'name'  => "zoom",
        'desc'  => "{PDFPlug.zoom}",
        'deft'  => "2",
        'range' => "1,3",    # actually the range is 0.5-3
        'type'  => "int",
    },
    {
        'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag",
    },
];

# Plugin-level description; new() pushes this onto the object's option_list.
my $options = {
    'name'     => "PDFPlug",
    'desc'     => "{PDFPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
70
# Constructor.
#
# Parses the PDFPlug-specific options (noimages, complex, zoom, nohidden,
# use_sections) out of the argument list, builds the underlying
# ConvertToPlug object from the full argument list, and records the
# options that ConvertToPlug passes on to gsConvert.pl.
#
# On unparsable options it prints a usage message to the plugin's
# outhandle and exits with status 1.
#
# Returns the new object blessed into $class.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # Keep an untouched copy: parsargv::parse is handed a reference to @_,
    # while the parent constructor is given this copy.
    my @args = @_;

    if (!parsargv::parse(\@_,
			 q^noimages^, \$noimages,
			 q^complex^, \$complex,
			 q^zoom/\d+/2^, \$zoom,
			 q^nohidden^, \$nohidden,
			 q^use_sections^, \$use_sections,
			 "allow_extra_options")) {

	# Build a parent object only so we have an outhandle and usage text.
	# Explicit arrow call instead of the indirect-object form
	# "new ConvertToPlug (...)", whose parsing depends on heuristics.
	my $self = ConvertToPlug->new($class, @_);
	my $outhandle = $self->{'outhandle'};
	print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
	$self->print_txt_usage("");  # Use default resource bundle
	exit 1;
    }

    # Sectioned output is expressed through description tags, so ask the
    # parent plugin chain to honour them.
    if ($use_sections) {
	push (@args, "-description_tags");
    }

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $self = ConvertToPlug->new($class, @args,
				  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
	$self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    # (defined-check avoids comparing an unset encoding against "auto")
    if (defined $self->{'input_encoding'}
	&& $self->{'input_encoding'} eq "auto") {
	$self->{'input_encoding'} = "utf8";
	$self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
126
# Default regular expression (as a string) selecting the files this plugin
# processes: any name ending in ".pdf", case-insensitively.
sub get_default_process_exp {
    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
132
# Default block expression: the empty string, so that we don't inherit
# HTMLPlug's block exp.
sub get_default_block_exp {
    my $no_block_exp = "";
    return $no_block_exp;
}
137
138
# Do plugin specific processing of doc_obj for HTML type.
#
# The arguments after $self are handed unchanged to
# ConvertToPlug::process_type. This sub itself only looks at:
#   $_[0]      - reference to the converted document text (edited in place)
#   $_[3]      - printed in the verbose "passing ... on" message
#   last arg   - the document object that receives the metadata
#
# When use_sections is on and the document was converted to HTML, the text
# is cut at every page anchor written by pdftohtml and each piece is wrapped
# in Greenstone <Section> description-tag comments.
#
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textref = $_[0];

    # we have "<a name=1></a>" etc for each page (it may be <A name=).
    # Count them now: the sectioning below strips these anchors out of the
    # text, so counting afterwards would always give 0 with use_sections.
    my $num_pages = () = $$textref =~ /\<[Aa] name=\"?\w+\"?>/ig;

    if ($self->{'use_sections'}
	&& $self->{'converted_to'} eq "HTML") {

	print $outhandle "PDFPlug: Calculating sections...\n";

	my @sections = split(/<[Aa] name=/, $$textref);

	my $found_anchor = (scalar (@sections) > 1);
	if ($found_anchor) {
	    shift @sections; # don't need HTML header, etc
	} else {
	    print $outhandle "PDFPlug: warning - no sections found\n";
	}

	# guards against completely empty input text
	my $first_section = defined $sections[0] ? $sections[0] : "";

	# handle first section specially for title? Or all use first 100...

	my $title = $first_section;
	$title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	$title =~ s/<[^>]*>/ /g;
	# either the HTML entity or the raw utf-8 bytes for nbsp
	$title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
	$title =~ s/^\s+//s;
	$title =~ s/\s+$//;
	$title =~ s/\s+/ /gs;
	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	$title =~ s/^\s+//s; # in case title_sub introduced any...
	# only cut (and mark with "...") titles that really are too long
	if (length ($title) > 100) {
	    $title = substr ($title, 0, 100);
	    $title =~ s/\s\S*$/.../;
	}

	my $top_section;
	if (scalar (@sections) <= 1) { # nothing worth sectioning
	    $top_section = $first_section;
	    # if the single piece came from a real split, drop the remains
	    # of its anchor (eg '1></a>') rather than leaving it in the text
	    $top_section =~ s@^\"?\w+\"?>(?:</a>)?@@ if $found_anchor;
	    @sections = ();
	} else {
	    $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
	}

	# add metadata per section...
	foreach my $section (@sections) {
	    # section names are not always just digits, may be like "outline"
	    # Greenstone does magic if sections are titled digits
	    my $section_title = " ";
	    # Only trust $1 when this substitution matched; after a failed
	    # match $1 still holds the capture of some earlier regex.
	    if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
		$section_title = $1;
	    } else {
		print STDERR "no title: $section\n";
	    }

	    my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
	    $newsection .= "<Metadata name=\"Title\">" . $section_title
		. "</Metadata>\n--><p>\n";
	    $newsection .= $section;
	    $newsection .= "<!--</Section>-->\n";
	    $section = $newsection;
	}

	$$textref = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8($textref);

    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
	if $self->{'verbosity'} > 1;

    # tell htmlplug to extract these metadata fields from the HEAD META fields.
    # Append only once: this runs per document on the same plugin object, so
    # an unconditional append would lengthen the list on every call.
    my $extra_fields = ",date,author<Creator>";
    $self->{'metadata_fields'} = ""
	unless defined $self->{'metadata_fields'};
    $self->{'metadata_fields'} .= $extra_fields
	unless $self->{'metadata_fields'} =~ /\Q$extra_fields\E/;

    my $result = ConvertToPlug::process_type($self,"pdf",@_);

    my $doc_obj = pop(@_);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the value list, since entries are deleted from the
    # document inside the loop (NOTE(review): get_metadata may already
    # return a private list - the copy is harmless either way).
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
	$doc_obj->delete_metadata($cursection, "date", $datemeta);

	# We're just interested in the date bit, not the time
	# some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
	# set a /CreationDate, and set /ModDate to 000000000. pdftohtml
	# extracts the ModDate, so it is 0...
	next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
	my ($year, $month, $day) = ($1, $2, $3);

	next if $year == 0;
	$year += 1900 if $year < 100; # just to be safe
	$month = "0$month" if $month =~ /^\d$/; # single digit
	$day = "0$day" if $day =~ /^\d$/; # single digit

	my $date = "$year$month$day";
	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    # Add NumPages metadata (counted from the page anchors at the top)
    $doc_obj->add_utf8_metadata($cursection, "NumPages", $num_pages);

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
	# we explicitly make it a paged document, cos greenstone won't get it
	# right if any section has an empty title, or one with letters in it
	$doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }
    return $result;
}
252
253	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: