Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 10419

Last change on this file since 10419 was 10353, checked in by chi, 19 years ago
Modification to allow PDF documents to be converted to various image types through the convert utility, and then to let the secondary plugin PagedImgPlug process the converted document.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.9 KB

Line
1	###########################################################################
2	#
3	# PDFPlug.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25	package PDFPlug;
26
27	use ConvertToPlug;
28	use unicode;
29	use strict;
30	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
31
# Set up inheritance at compile time: PDFPlug derives from ConvertToPlug.
BEGIN {
    @PDFPlug::ISA = qw(ConvertToPlug);
}
35
# Descriptions of the arguments this plugin accepts.  Each entry is a hash
# with a name, a description key, a type and, where applicable, a default
# value and permitted range.  new() appends this list to the caller's
# "ArgList".
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag",
    },
    {
        'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag",
    },
    {
        'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag",
    },
    {
        'name'  => "zoom",
        'desc'  => "{PDFPlug.zoom}",
        'deft'  => "2",
        'range' => "1,3",    # actually the range is 0.5-3
        'type'  => "int",
    },
    {
        'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag",
    },
];
63
# Description record for this plugin as a whole; new() appends it to the
# caller's "OptList".  Its 'args' entry points at the argument list above.
my $options = {
    'name'     => "PDFPlug",
    'desc'     => "{PDFPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
69
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it.
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      and its pattern are appended before the list is
#                      handed to ConvertToPlug.
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" and its option record to
#                      its "OptList".
#
# Returns the object built by ConvertToPlug, blessed into $class, after
# load_secondary_plugins() has been called with the secondary plugin options.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit Class->new(...) rather than the indirect-object form
    # "new Class(...)", which Perl parses heuristically and which is
    # especially fragile inside a package that defines its own new().
    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : ConvertToPlug->new($pluginlist, $inputargs);

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure an option list exists for each secondary plugin we add to.
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg/i) {
        # NOTE(review): unlike HTMLPlug/TEXTPlug below, title_sub is only
        # added here when no PagedImgPlug options were supplied already -
        # confirm that this asymmetry is intended.
        if (!defined $secondary_plugin_options->{'PagedImgPlug'}) {
            $secondary_plugin_options->{'PagedImgPlug'} = [ "-title_sub", $title_sub ];
        }
    }
    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    if ($self->{'input_encoding'} eq "auto") {
        # pdftohtml will always produce html files encoded as utf-8
        # => restrict primary PDFPlug and secondary HTML plugin to use
        # utf8 and extract language.
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;

        push(@$html_options, "-input_encoding", "utf8");
        push(@$html_options, "-extract_language");
    }

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");

    if ($self->{'use_sections'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options);
    return $self;
}
145
# Default process expression: a case-insensitive match on a ".pdf" filename
# suffix.  Any invocant passed in is discarded.
sub get_default_process_exp {
    shift (@_);

    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
151
# Default block expression is empty, so that we don't inherit HTMLPlug's
# block exp.
sub get_default_block_exp {
    return q{};
}
156
# Post-process a converted file in place.
#
# Reads $conv_filename (using the language/encoding reported by
# textcat_get_language_encoding), records the number of page anchors found
# in $self->{'num_pages'}, and - when 'use_sections' is set and the file was
# converted to HTML - wraps each page in <Section> comment markup with a
# Title.  The text is then passed through unicode::ensure_utf8 and written
# back to the same file with utf8_write_file.
#
# Arguments:
#   $conv_filename - name of the converted file to rewrite.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page).  Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlug: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # The document title is derived from the text of the first section.
        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        # Both the HTML entity and the raw utf-8 bytes for a non-breaking
        # space become an ordinary space.
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        # Only abbreviate titles that really are too long; a short title
        # must not lose its last word.
        if (length ($title) > 100) {
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }

        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits
            my $section_title;
            # Branch on the substitution itself: after a failed match $1
            # still holds whatever an earlier match captured.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
247
248
# Do plugin specific processing of doc_obj.
#
# Delegates to process_type("pdf", ...), then tidies the metadata on the
# document's top section: every "date" value is removed and, where it
# contains a usable year-month-day, re-added as "Date" in yyyymmdd form;
# "NumPages" is set from $self->{'num_pages'}; and, when sections are in use
# for an HTML conversion, "gsdlthistype" is set to "Paged".
#
# Of the arguments, only $base_dir, $file and $doc_obj are used here.
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Skip values that don't look like a date at all: after a failed
        # match $1..$3 would still hold captures from some earlier match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        $year += 1900 if $year < 100;              # just to be safe
        $month = "0$month" if $month =~ /^\d$/;    # single digit
        $day   = "0$day"   if $day   =~ /^\d$/;    # single digit

        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
289
290	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: