source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 4748

Last change on this file since 4748 was 4744, checked in by mdewsnip, 21 years ago

Tidied up the $arguments and $options structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.9 KB
Line 
1###########################################################################
2#
3# PDFPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PDFPlug;
27
28use ConvertToPlug;
29
# Set up inheritance at compile time: PDFPlug derives from ConvertToPlug.
BEGIN {
    our @ISA = qw(ConvertToPlug);
}
33
# Descriptions of the options understood by PDFPlug.  Each entry is a hash
# with a 'name', a human-readable 'desc', a 'type' and, where applicable, a
# default value ('deft') and a required marker ('reqd').  This list is
# referenced from $options below, which new() appends to the plugin's
# option_list.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => "string",
        'deft' => q^^ },
      { 'name' => "noimages",
        'desc' => "Don't attempt to extract images from PDF.",
        'type' => "flag" },
      { 'name' => "complex",
        # description previously read "you Ghostscript installed" (missing verb)
        'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
        'type' => "flag" },
      { 'name' => "nohidden",
        'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).",
        'deft' => "2",
        'type' => "int" },
      { 'name' => "use_sections",
        'desc' => "Create a separate section for each page of the PDF file.",
        'type' => "flag" } ];
60
# Top-level description of this plugin: its name, a short summary, whether
# it inherits options from its parent, and the argument list defined above.
my $options = {
    name     => 'PDFPlug',
    desc     => 'Reasonably with-it pdf plugin.',
    inherits => 'yes',
    args     => $arguments,
};
65
# Constructor.
#
# Parses the PDFPlug-specific options (-noimages, -complex, -zoom, -nohidden,
# -use_sections) from the argument list, constructs the underlying
# ConvertToPlug object with the remaining arguments, and records the
# conversion options that ConvertToPlug passes on to gsConvert.pl.
#
# Arguments: the class name followed by the plugin option list.
# Returns:   the new plugin object, blessed into $class.
# Dies (after printing the usage message) if the options cannot be parsed.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections/1?/^, \$use_sections,
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    my @args = @_;
    if ($use_sections) {
        push (@args, "-description_tags");
    }

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?

    # Direct method call instead of the ambiguous indirect object syntax
    # ("new ConvertToPlug (...)"); the argument list is unchanged.
    my $self = ConvertToPlug->new ($class, @args, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    # (the defined() guard avoids comparing an unset input_encoding)
    if (defined $self->{'input_encoding'}
        && $self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
118
119
# Print a summary of the options accepted by PDFPlug to STDERR.
# Takes no arguments; the return value is not meaningful.
sub print_usage {
    print STDERR "\n usage: plugin PDFPlug [options]\n\n";
    print STDERR " options:\n";
    print STDERR " -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
    print STDERR " -use_sections Create a separate section for each page\n";
    print STDERR " of the PDF file.\n";
    print STDERR " -noimages Don't attempt to extract images from PDF.\n";
    print STDERR " -complex Create more complex output. With this option\n";
    print STDERR " set the output html will look much more like\n";
    print STDERR " the original PDF file. For this to function\n";
    # was "properly you Ghostscript installed" (missing verb)
    print STDERR " properly you need Ghostscript installed (for *nix\n";
    print STDERR " gs should be on your path while for windows\n";
    print STDERR " you must have gswin32c.exe on your path).\n";
    print STDERR " -nohidden Prevent pdftohtml from attempting to extract\n";
    print STDERR " hidden text. This is only useful if the -complex\n";
    # the trailing newline was missing here, which ran the -zoom entry
    # onto the end of the -nohidden description
    print STDERR " option is also set.\n";
    # was "zoomthe"
    print STDERR " -zoom The factor by which to zoom the PDF for output\n";
    print STDERR " (this is only useful if -complex is set).\n\n";
}
139
140
141
# Default process_exp for this plugin: a case-insensitive pattern matching
# names that end in ".pdf".  The invocant is accepted but not used.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.pdf$';
}
147
# Returns an empty block expression so that this plugin does not inherit
# HTMLPlug's block exp.
sub get_default_block_exp {
    return q{};
}
152
153
# do plugin specific processing of doc_obj for HTML type
#
# When the plugin was created with -use_sections and the document was
# converted to HTML, the text is split at the "<a name=N></a>" page anchors
# written by pdftohtml.  Each page is wrapped in Greenstone
# "<!-- <Section> ... -->" description tags titled with its page number, and
# a top-level section is prepended whose title comes from the text of the
# first page.
#
# Arguments (after $self): $_[0] is a reference to the document text, which
# is modified in place; $_[3] is only used in the verbose progress message.
# All arguments are handed on unchanged to ConvertToPlug::process_type.
#
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print STDERR "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        my @sections = split('<a name=', $$textref);

        shift @sections; # don't need HTML header, etc

        if (!@sections) {
            # No anchors at all: leave the text untouched rather than
            # replacing the whole document with an empty top section.
            print STDERR "PDFPlug: no page anchors found, not creating sections\n";
        }
        else {
            # handle first section specially for title? Or all use first 100...
            my $title = $sections[0];
            $title =~ s/^\d+>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...

            # Only shorten (and mark with "...") titles that really are too
            # long; shortening unconditionally chopped the last word off
            # every short title.
            if (length ($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # Greenstone does magic if sections are titled digits.
                # Read $1 only when the substitution matched; after a failed
                # match it may still hold a value from an earlier match.
                my $page_number = '';
                if ($section =~ s@^(\d+)></a>@@) { # leftover from split expression...
                    $page_number = $1;
                }
                else {
                    print STDERR "no title: $section\n";
                }

                $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                    . "<Metadata name=\"Title\">" . $page_number
                    . "</Metadata>\n--><p>\n"
                    . $section
                    . "<!--</Section>-->\n";
            }

            $$textref = join('', ($top_section, @sections));
        }
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self,"pdf",@_);
}
209
2101;
Note: See TracBrowser for help on using the repository browser.