source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 7243

Last change on this file since 7243 was 7107, checked in by kjdon, 20 years ago

added a range to the zoom arg

  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1###########################################################################
2#
3# PDFPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PDFPlug;
27
28use ConvertToPlug;
29
# Establish inheritance at compile time: PDFPlug derives from ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
33
# Descriptions of the options PDFPlug itself understands.  Each entry is a
# hash with a 'name', a 'desc' (a "{...}" placeholder string), a 'type' and,
# where applicable, a default ('deft'), a 'range' and a 'reqd' marker.
# The list is attached to $options under the 'args' key.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlug.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlug.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlug.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlug.zoom}',
        'deft'  => '2',
        # Declared as an integer range of 1..3 although, per the original
        # author's note, the real usable range is 0.5-3.
        'range' => '1,3',
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlug.use_sections}',
        'type' => 'flag',
    },
];
61
# Top-level description of this plugin: its name, description placeholder,
# whether it is abstract, whether it inherits options, and the argument
# list defined in $arguments.  new() pushes this onto the object's
# 'option_list'.
my $options = {
    'name'     => 'PDFPlug',
    'desc'     => '{PDFPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
67
# Constructor.
#
# Parses the PDFPlug-specific options out of the argument list, builds the
# underlying ConvertToPlug object and records the options that are to be
# passed through to gsConvert.pl.
#
# Options handled here:
#   -noimages      adds " -pdf_ignore_images" to the convert options
#   -complex       adds " -pdf_complex" to the convert options
#   -nohidden      adds " -pdf_nohidden" to the convert options
#   -zoom N        becomes "-pdf_zoom N" (digits only, default 2)
#   -use_sections  adds "-description_tags" to the parent's arguments and
#                  sets $self->{'use_sections'}, which process() checks
# Unrecognised options are tolerated here ("allow_extra_options").
#
# Returns the new object blessed into $class.  If option parsing fails, a
# message and the usage text are printed to the plugin's output handle and
# the program exits with status 1.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # Copy of the arguments taken before parsing; this copy (not @_) is
    # what is handed to the parent constructor on the success path.
    my @args = @_;

    # The title_sub pattern removes "Page 1" added by pdftohtml, and a
    # leading "1", which is often the page number at the top of the page.
    # Bad luck if your document title actually starts with "1 " - is there
    # a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {

        # A throw-away object is built only to get at the output handle
        # and the usage printer.  It is a lexical: nothing outside this
        # branch needs it, so there is no reason to localise a global.
        my $usage_plug = ConvertToPlug->new($class, @_, "-title_sub", $title_sub);
        my $outhandle = $usage_plug->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $usage_plug->print_txt_usage("");  # Use default resource bundle
        exit 1;
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    my $self = ConvertToPlug->new($class, @args, "-title_sub", $title_sub);
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
123
124
125# sub print_usage {
126# print STDERR "\n usage: plugin PDFPlug [options]\n\n";
127# print STDERR " options:\n";
128# print STDERR " -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
129# print STDERR " -use_sections Create a separate section for each page\n";
130# print STDERR " of the PDF file.\n";
131# print STDERR " -noimages Don't attempt to extract images from PDF.\n";
132# print STDERR " -complex Create more complex output. With this option\n";
133# print STDERR " set the output html will look much more like\n";
134# print STDERR " the original PDF file. For this to function\n";
135# print STDERR " properly you need Ghostscript installed (for *nix\n";
136# print STDERR " gs should be on your path while for windows\n";
137# print STDERR " you must have gswin32c.exe on your path).\n";
138# print STDERR " -nohidden Prevent pdftohtml from attempting to extract\n";
139# print STDERR " hidden text. This is only useful if the -complex\n";
140# print STDERR " option is also set.";
141# print STDERR " -zoom The factor by which to zoom the PDF for output\n";
142# print STDERR " (this is only useful if -complex is set).\n\n";
143# }
144
145
146
# Default pattern for the process_exp option: matches names ending in
# ".pdf", case-insensitively.  Takes no meaningful arguments.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
152
# Default pattern for the block_exp option.  Deliberately the empty string
# so that HTMLPlug's block expression is not inherited.
sub get_default_block_exp {
    return q{};
}
157
158
# Do plugin-specific processing of the converted text, then hand over to
# ConvertToPlug::process_type.
#
# Arguments (after $self) are passed on unchanged to
# ConvertToPlug::process_type.  Only two are read here: the first is a
# reference to the document text, and the fourth is interpolated into the
# verbose "passing ... on" message.
#
# When $self->{'use_sections'} is set and the document was converted to
# HTML, the text is split at every '<a name=' (pdftohtml emits
# "<a name=N></a>" for each page) and rewritten in place as one
# Greenstone <Section> per page, each titled with its page number, preceded
# by a top-level section whose title is taken from the first page's text.
#
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        my @sections = split('<a name=', $$textref);

        if (scalar (@sections) == 1) {  # only one section - no split!
            print $outhandle "PDFPlug: warning - no sections found\n";
        } else {
            shift @sections;  # don't need HTML header, etc
        }

        # handle first section specially for title? Or all use first 100...

        # Empty input text yields no sections at all; fall back to an
        # empty title rather than operating on undef.
        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\d+>//;                   # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g;        # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;   # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s;  # in case title_sub introduced any...

        # Only shorten (and mark with "...") when the title really is too
        # long; a short title must not lose its last word.
        if (length($title) > 100) {
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;  # cut back to the last whole word
        }

        my $top_section;
        if (scalar (@sections) == 1) {  # no sections found
            $top_section = $sections[0];
            @sections = ();
        } else {
            $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # Strip the leftover from the split expression and keep the
            # page number.  $1 is only read when this substitution
            # matched; after a failed match it would still hold a capture
            # from an earlier regex.
            my $page_title;
            if ($section =~ s@^(\d+)></a>@@) {
                # Greenstone does magic if sections are titled digits
                $page_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $page_title = "";
            }

            # $section aliases the array element, so this rewrites
            # @sections in place.
            $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $page_title
                . "</Metadata>\n--><p>\n"
                . $section
                . "<!--</Section>-->\n";
        }

        $$textref = join('', ($top_section, @sections));
    }

    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self, "pdf", @_);
}
226
2271;
Note: See TracBrowser for help on using the repository browser.