Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 3833

Last change on this file since 3833 was 3833, checked in by jrm21, 21 years ago
fixed up parsing the use_sections argument.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.3 KB

Line
1	###########################################################################
2	#
3	# PDFPlug.pm -- reasonably with-it pdf plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PDFPlug;
27
28	use ConvertToPlug;
29
# Register ConvertToPlug as the parent class. This is done in a BEGIN
# block so that the inheritance chain is in place as soon as this part
# of the file has been compiled.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
33
# Descriptions of the options this plugin understands. The list is
# attached to $options below and published through the plugin's
# option_list by new().
#
# Each entry has a 'name', a human readable 'desc', a 'type', and
# optionally a default ('deft') and a required marker ('reqd').
my $arguments =
    [ { 'name' => "process_exp",
        # "\\." is needed here: inside a double-quoted string a bare "\."
        # collapses to "." and the documented regex would be shown wrongly.
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => q^(?i)\.pdf$^,
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => 'string',
        'deft' => q^^ },
      { 'name' => "noimages",
        'desc' => "Don't attempt to extract images from PDF.",
        'type' => "flag" },
      { 'name' => "complex",
        'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).",
        'type' => "int",
        # same default the constructor uses when parsing -zoom
        'deft' => "2" }
      ];
53
# Plugin level description: the plugin's name, a one line summary, a
# marker saying that inherited options apply as well, and the argument
# list declared above.
my $options = {
    'name'     => "PDFPlug",
    'desc'     => "Reasonably with-it pdf plugin.",
    'inherits' => "yes",
    'args'     => $arguments,
};
58
# Constructor.
#
# Parses the PDFPlug specific options (-noimages, -complex, -zoom and
# -use_sections) out of the argument list, builds the object through
# ConvertToPlug's constructor and records the settings that are later
# handed to gsConvert.pl. Prints the usage text and dies when the options
# cannot be parsed.
sub new {
    my $class = shift;

    my $noimages;
    my $complex;
    my $zoom;
    my $use_sections;

    # parse works on @_ itself; what remains afterwards is passed on to
    # the parent constructor
    my $parsed_ok = parsargv::parse(\@_,
                                    q^noimages^, \$noimages,
                                    q^complex^, \$complex,
                                    q^zoom/\d+/2^, \$zoom,
                                    q^use_sections/1?/^, \$use_sections,
                                    "allow_extra_options");
    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    my @args = @_;
    push (@args, "-description_tags") if $use_sections;

    # The title_sub pattern strips the "Page 1" that pdftohtml adds, and a
    # leading "1" (often the page number printed at the top of the page).
    # A document title that really starts with "1 " loses that prefix too.
    my $self = ConvertToPlug->new($class, @args,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    $self->{'use_sections'} = 1 if $use_sections;

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push (@{$option_list}, $options);

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    my @convert_flags = ("-pdf_zoom $zoom");
    push (@convert_flags, "-pdf_complex") if $complex;
    push (@convert_flags, "-pdf_ignore_images") if $noimages;
    $self->{'convert_options'} = join(' ', @convert_flags);

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
109
110
# Print a summary of the options accepted by PDFPlug to STDERR.
# Takes no arguments; new() calls it when option parsing fails.
sub print_usage {
    print STDERR "\n  usage: plugin PDFPlug [options]\n\n";
    print STDERR "  options:\n";
    print STDERR "   -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
    print STDERR "   -use_sections           Create a separate section for each page\n";
    print STDERR "                           of the PDF file.\n";
    print STDERR "   -noimages               Don't attempt to extract images from PDF.\n";
    print STDERR "   -complex                Create more complex output. With this option\n";
    print STDERR "                           set the output html will look much more like\n";
    print STDERR "                           the original PDF file. For this to function\n";
    print STDERR "                           properly you need Ghostscript installed (for\n";
    print STDERR "                           *nix gs should be on your path while for\n";
    print STDERR "                           windows you must have gswin32c.exe on your\n";
    print STDERR "                           path).\n";
    print STDERR "   -zoom                   The factor by which to zoom the PDF for output\n";
    print STDERR "                           (default 2; this is only useful if -complex\n";
    print STDERR "                           is set).\n\n";
}
127
128
129
# Return the default regular expression for the filenames this plugin
# processes: anything ending in ".pdf", compared case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_filename_exp = '(?i)\.pdf$';
    return $pdf_filename_exp;
}
135
# Return an empty block expression, so that we don't inherit HTMLPlug's
# block exp...
sub get_default_block_exp {
    my $nothing_blocked = "";
    return $nothing_blocked;
}
140
141
# do plugin specific processing of doc_obj for HTML type
#
# When -use_sections is set and the document was converted to HTML, the
# text referenced by the first argument is rewritten in place: it is split
# on the "<a name=N></a>" anchors, and every piece is wrapped in
# Greenstone "<!-- <Section> ... -->" description tags whose Title is the
# anchor number. An (empty) top level section titled with the start of the
# first piece is put in front.
#
# Arguments (after $self) are passed on unchanged to
# ConvertToPlug::process_type, whose result is returned. $_[0] must be a
# reference to the document text; $_[3] is only used in the verbose
# message.
sub process {
    my $self = shift (@_);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print STDERR "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        my @sections = split('<a name=', $$textref);

        shift @sections; # don't need HTML header, etc

        if (!@sections) {
            # No anchor at all: rebuilding the text from an empty section
            # list would throw the whole document away, so leave it alone.
            print STDERR "PDFPlug: no page anchors found, not adding sections\n";
        }
        else {
            # handle first section specially for title? Or all use first 100...
            my $title = $sections[0];
            $title =~ s/^\d+>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            # nbsp, either as an entity or as raw utf-8 bytes
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            $title = substr ($title, 0, 100);
            # NOTE(review): this also replaces the last word of titles that
            # were shorter than 100 characters - confirm that is intended.
            $title =~ s/\s\S*$/.../;

            my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # Greenstone does magic if sections are titled digits
                my $section_title;

                # Only trust $1 when this substitution really matched;
                # after a failed match it still holds an older capture.
                if ($section =~ s@^(\d+)></a>@@) { # leftover from split expression...
                    $section_title = $1;
                }
                else {
                    print STDERR "no title: $section\n";
                    $section_title = "";
                }

                $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                    . "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><p>\n"
                    . $section
                    . "<!--</Section>-->\n";
            }

            $$textref = join('', ($top_section, @sections));
        }
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self,"pdf",@_);
}
197
198	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: