Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 7243

Last change on this file since 7243 was 7107, checked in by kjdon, 20 years ago
added a range to the zoom arg
Property svn:keywords set to `Author Date Id Revision`
File size: 7.5 KB

Line
1	###########################################################################
2	#
3	# PDFPlug.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PDFPlug;
27
28	use ConvertToPlug;
29
# Establish inheritance at compile time: PDFPlug derives from ConvertToPlug.
sub BEGIN {
    @ISA = qw(ConvertToPlug);
}
33
# Descriptions of the options this plugin understands.  Each entry gives the
# option name, a 'desc' string (presumably a key into a resource bundle -
# confirm against the usage-printing code), its type and, where relevant,
# a default value and range.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlug.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlug.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlug.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlug.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlug.use_sections}',
        'type' => 'flag',
    },
];

# Plugin-level description; new() appends this to the object's option list.
my $options = {
    'name'     => 'PDFPlug',
    'desc'     => '{PDFPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
67
# Constructor.
#
# Parses the PDFPlug-specific options (noimages, complex, zoom, nohidden,
# use_sections) out of the argument list, builds the underlying
# ConvertToPlug object, and records the conversion options that are passed
# through to gsConvert.pl.
#
# Arguments: the class name followed by the plugin option list.
# Returns:   the new plugin object, blessed into $class.
# On an option-parsing failure a usage message is printed to the plugin's
# outhandle and the program exits with status 1.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # Copy of the option list taken before parsing; this copy (not @_) is
    # what gets handed on to the ConvertToPlug constructor below.
    my @args = @_;

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {

        # A lexical is all that is needed here; the object only lives long
        # enough to print the usage text before we exit.
        my $self = ConvertToPlug->new($class, @_, "-title_sub", $title_sub);
        my $outhandle = $self->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        exit 1;
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    my $self = ConvertToPlug->new($class, @args, "-title_sub", $title_sub);
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
123
124
125	# sub print_usage {
126	# print STDERR "\n usage: plugin PDFPlug [options]\n\n";
127	# print STDERR " options:\n";
128	# print STDERR " -convert_to (html\|text) Convert to TEXT or HTML (default html)\n";
129	# print STDERR " -use_sections Create a separate section for each page\n";
130	# print STDERR " of the PDF file.\n";
131	# print STDERR " -noimages Don't attempt to extract images from PDF.\n";
132	# print STDERR " -complex Create more complex output. With this option\n";
133	# print STDERR " set the output html will look much more like\n";
134	# print STDERR " the original PDF file. For this to function\n";
135	# print STDERR " properly you need Ghostscript installed (for *nix\n";
136	# print STDERR " gs should be on your path while for windows\n";
137	# print STDERR " you must have gswin32c.exe on your path).\n";
138	# print STDERR " -nohidden Prevent pdftohtml from attempting to extract\n";
139	# print STDERR " hidden text. This is only useful if the -complex\n";
140	# print STDERR " option is also set.\n";
141	# print STDERR " -zoom The factor by which to zoom the PDF for output\n";
142	# print STDERR " (this is only useful if -complex is set).\n\n";
143	# }
144
145
146
# Default regular expression selecting the files this plugin processes:
# any name ending in ".pdf", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
152
# Default block expression: the empty string, supplied here so that
# HTMLPlug's block expression is not inherited.
sub get_default_block_exp {
    my $no_block_exp = '';
    return $no_block_exp;
}
157
158
# do plugin specific processing of doc_obj for HTML type
#
# When the 'use_sections' option is set and the document was converted to
# HTML, the text referenced by the first argument is rewritten in place so
# that every page (marked by pdftohtml with "<a name=N></a>") is wrapped in
# <Section> description-tag comments, with the page number as its title.
#
# Arguments (after $self) are forwarded unchanged to
# ConvertToPlug::process_type; of these, $_[0] is used as a reference to
# the document text and $_[3] is printed in the verbose message.
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        my @sections = split('<a name=', $$textref);

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlug: warning - no sections found\n";
        } else {
            shift @sections; # don't need HTML header, etc
        }

        # handle first section specially for title? Or all use first 100...

        # Guard against an empty document, where there is no first section.
        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\d+>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        # Either the HTML entity or the raw utf-8 bytes for nbsp; this must
        # be an alternation (unescaped "|") to match each form on its own.
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        # Only shorten (and mark with "...") titles that really are too
        # long; a short title must not lose its last word.
        if (length ($title) > 100) {
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }

        my $top_section;
        if (scalar (@sections) == 1) { # no sections found
            $top_section = $sections[0];
            @sections = ();
        } else {
            $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # Greenstone does magic if sections are titled digits.
            # Take the page number only from a successful match: reading $1
            # after a failed substitution would pick up a stale capture
            # left behind by an earlier, unrelated match.
            my $page_title;
            if ($section =~ s@^(\d+)></a>@@) { # leftover from split expression...
                $page_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $page_title = "";
            }

            # $section aliases the array element, so this updates @sections.
            $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $page_title
                . "</Metadata>\n--><p>\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $$textref = join('', ($top_section, @sections));
    }

    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self,"pdf",@_);
}
226
227	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: