source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 7243

Last change on this file since 7243 was 7107, checked in by kjdon, 20 years ago

added a range to the zoom arg

  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
RevLine 
###########################################################################
#
# PDFPlug.pm -- reasonably with-it pdf plugin
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
25
package PDFPlug;

use ConvertToPlug;

# PDFPlug is a ConvertToPlug: the parent class is set up at compile time
# so that method lookups through @ISA work as soon as the file is loaded.
sub BEGIN {
    @ISA = ('ConvertToPlug');
}
33
# Description of the options PDFPlug itself understands.  The 'desc'
# values of the form "{Name.key}" are placeholders, not literal help text.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp() },
      { 'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag" },
      { 'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag" },
      { 'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "{PDFPlug.zoom}",
        'deft' => "2",
        # actually the range is 0.5-3, but the option is declared as an
        # int (and new() only accepts \d+), so whole numbers are advertised
        'range' => "1,3",
        'type' => "int" },
      { 'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag" } ];

# Top-level description of this plugin; new() appends it to the
# 'option_list' of the object it builds.
my $options = { 'name'     => "PDFPlug",
                'desc'     => "{PDFPlug.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
67
# Constructor.
#
# Parses the PDFPlug-specific options (-noimages, -complex, -zoom N,
# -nohidden, -use_sections) out of the argument list, builds the object
# through ConvertToPlug->new, and records the pdf conversion switches in
# $self->{'convert_options'}.
#
# On an option parsing failure it prints a usage message to the plugin's
# 'outhandle' and terminates the program with exit status 1.
#
# Returns the new object blessed into $class.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # copy of the argument list taken before parsing; this copy (plus any
    # additions below) is what gets handed to the parent constructor
    my @args = @_;

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {

        # build an object only so that we have an outhandle and the
        # usage printer available; a lexical is enough for that
        my $self = ConvertToPlug->new($class, @_, "-title_sub", $title_sub);
        my $outhandle = $self->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        exit 1;
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    my $self = ConvertToPlug->new($class, @args, "-title_sub", $title_sub);
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
123
124
# sub print_usage {
#     print STDERR "\n  usage: plugin PDFPlug [options]\n\n";
#     print STDERR "  options:\n";
#     print STDERR "   -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
#     print STDERR "   -use_sections           Create a separate section for each page\n";
#     print STDERR "                           of the PDF file.\n";
#     print STDERR "   -noimages               Don't attempt to extract images from PDF.\n";
#     print STDERR "   -complex                Create more complex output. With this option\n";
#     print STDERR "                           set the output html will look much more like\n";
#     print STDERR "                           the original PDF file. For this to function\n";
#     print STDERR "                           properly you need Ghostscript installed (for *nix\n";
#     print STDERR "                           gs should be on your path while for windows\n";
#     print STDERR "                           you must have gswin32c.exe on your path).\n";
#     print STDERR "   -nohidden               Prevent pdftohtml from attempting to extract\n";
#     print STDERR "                           hidden text. This is only useful if the -complex\n";
#     print STDERR "                           option is also set.\n";
#     print STDERR "   -zoom                   The factor by which to zoom the PDF for output\n";
#     print STDERR "                           (this is only useful if -complex is set).\n\n";
# }
[1410]144
[3411]145
146
# Returns the default process_exp pattern (as a string): file names
# ending in ".pdf", matched case-insensitively.  Any arguments, including
# an invocant, are ignored.
sub get_default_process_exp {
    return q^(?i)\.pdf$^;
}
[2661]152
# so we don't inherit HTMLPlug's block exp...
# Returns the default block_exp pattern: the empty string.
sub get_default_block_exp {
    return "";
}
[1410]157
158
# do plugin specific processing of doc_obj for HTML type
#
# When the 'use_sections' option is on and the document was converted to
# HTML, the converted text is split at every '<a name=' and rewritten so
# that each piece is wrapped in Greenstone <Section> description comments,
# preceded by an (empty) top-level section carrying a title derived from
# the first piece.  If there is nothing to split on, a warning is printed
# and the text is left exactly as it was.
#
# Arguments after $self are handed on unchanged to
# ConvertToPlug::process_type; only two are looked at here:
#   $_[0] - reference to the converted text (rewritten in place)
#   $_[3] - shown in the verbose progress message (presumably the file
#           name - confirm against the caller)
#
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        my @sections = split('<a name=', $$textref);

        # The decision has to be made BEFORE the header is shifted off:
        # a one-page document splits into (header, page) and must still be
        # sectioned, whereas 0 or 1 pieces means no anchor was found.
        if (scalar (@sections) <= 1) { # only one section - no split!
            print $outhandle "PDFPlug: warning - no sections found\n";
        } else {
            shift @sections; # don't need HTML header, etc

            # handle first section specially for title? Or all use first 100...

            my $title = $sections[0];
            $title =~ s/^\d+>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...

            # only elide when something is really being cut off, so that
            # a short title keeps its last word
            if (length ($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # Greenstone does magic if sections are titled digits.
                # Take $1 only when the substitution matched; after a
                # failed match $1 would still hold an earlier capture.
                my $section_title = "";
                if ($section =~ s@^(\d+)></a>@@) { # leftover from split expression...
                    $section_title = $1;
                } else {
                    print STDERR "no title: $section\n";
                }

                my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
                $newsection .= "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><p>\n";
                $newsection .= $section;
                $newsection .= "<!--</Section>-->\n";
                $section = $newsection; # foreach aliases the array element
            }

            $$textref = join('', ($top_section, @sections));
        }
    }

    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self,"pdf",@_);
}
226
# a module file must end by returning a true value
1;
Note: See TracBrowser for help on using the repository browser.