1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # PDFPlug.pm -- reasonably with-it pdf plugin
|
---|
4 | # A component of the Greenstone digital library software
|
---|
5 | # from the New Zealand Digital Library Project at the
|
---|
6 | # University of Waikato, New Zealand.
|
---|
7 | #
|
---|
8 | # Copyright (C) 1999-2001 New Zealand Digital Library Project
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | #
|
---|
24 | ###########################################################################
|
---|
25 |
|
---|
26 | package PDFPlug;
|
---|
27 |
|
---|
28 | use ConvertToPlug;
|
---|
29 |
|
---|
# Set up inheritance at compile time: PDFPlug derives from ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
|
---|
33 |
|
---|
# Descriptions of the options this plugin understands. Each entry is a
# hash with 'name', 'desc' and 'type', plus optional 'deft' (default value)
# and 'reqd' (whether the option is required).
#
# Note on quoting: the descriptions are double-quoted strings, so a literal
# backslash must be written as "\\" - a bare "\." would silently collapse
# to "." and the example regular expression would be displayed wrongly.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g. HTMLPlug defaults to '(?i)\\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => q^(?i)\.pdf$^,
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => 'string',
        'deft' => q^^,
        'reqd' => "no" },
      { 'name' => "noimages",
        'desc' => "Don't attempt to extract images from PDF.",
        'type' => "flag" },
      { 'name' => "complex",
        'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).",
        'type' => "int",
        # same default as the "zoom/\d+/2" parse specification used in new()
        'deft' => "2",
        'reqd' => "no" },
      # accepted by new() and listed by print_usage(), so describe it here too
      { 'name' => "use_sections",
        'desc' => "Create a separate section for each page of the PDF file.",
        'type' => "flag" }
    ];

# Plugin description record; new() appends it to the 'option_list'
# obtained from the parent constructor. 'args' refers to the table above.
my $options = { 'name'     => "PDFPlug",
                'desc'     => "Reasonably with-it pdf plugin.",
                'inherits' => "yes",
                'args'     => $arguments };
|
---|
58 |
|
---|
# Constructor.
#
# Parses the options owned by this plugin (noimages, complex, zoom,
# use_sections) out of the argument list, hands everything else on to the
# ConvertToPlug constructor, and records the conversion options on the
# resulting object. Dies (after printing the usage text) when the options
# cannot be parsed.
sub new {
    my $class = shift (@_);

    my ($skip_images, $complex_output, $zoom_factor, $page_sections);

    my $parsed_ok = parsargv::parse(\@_,
                                    q^noimages^, \$skip_images,
                                    q^complex^, \$complex_output,
                                    q^zoom/\d+/2^, \$zoom_factor,
                                    q^use_sections/1?/^, \$page_sections,
                                    "allow_extra_options");
    unless ($parsed_ok) {
        print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    # whatever is left in @_ after parsing is passed to the parent
    my @parent_args = @_;
    push (@parent_args, "-description_tags") if $page_sections;

    # The title_sub below removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $self = ConvertToPlug->new ($class, @parent_args,
                                   "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    $self->{'use_sections'} = 1 if $page_sections;

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $inherited_options = $self->{'option_list'};
    push (@{$inherited_options}, $options);

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    my @convert_flags = ("-pdf_zoom $zoom_factor");
    push (@convert_flags, "-pdf_complex") if $complex_output;
    push (@convert_flags, "-pdf_ignore_images") if $skip_images;
    $self->{'convert_options'} = join (" ", @convert_flags);

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        @{$self}{'input_encoding', 'extract_language'} = ("utf8", 1);
    }

    return bless $self, $class;
}
|
---|
109 |
|
---|
110 |
|
---|
# Print a summary of the plugin's command line options to STDERR.
# Takes no arguments; the return value is that of print.
sub print_usage {
    # join() so the output does not depend on the value of $, (the output
    # field separator)
    print STDERR join ('',
        "\n usage: plugin PDFPlug [options]\n\n",
        " options:\n",
        " -convert_to (html|text) Convert to TEXT or HTML (default html)\n",
        " -use_sections Create a separate section for each page\n",
        " of the PDF file.\n",
        " -noimages Don't attempt to extract images from PDF.\n",
        " -complex Create more complex output. With this option\n",
        " set the output html will look much more like\n",
        " the original PDF file. For this to function\n",
        " properly you need Ghostscript installed (for *nix\n",
        " gs should be on your path while for windows\n",
        " you must have gswin32c.exe on your path).\n",
        " -zoom The factor by which to zoom the PDF for output\n",
        " (this is only useful if -complex is set).\n\n");
}
|
---|
127 |
|
---|
128 |
|
---|
129 |
|
---|
# Default pattern for the file names this plugin processes: anything
# ending in ".pdf", compared case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_suffix = '(?i)\.pdf$';
    return $pdf_suffix;
}
|
---|
135 |
|
---|
# Return an empty block expression so that we don't inherit HTMLPlug's
# block exp...
sub get_default_block_exp {
    my $no_blocking = "";
    return $no_blocking;
}
|
---|
140 |
|
---|
141 |
|
---|
# Do plugin specific processing of the converted text, then pass the
# document on via ConvertToPlug::process_type.
#
# Arguments (after $self): $_[0] is a reference to the document text, which
# is modified in place; $_[3] is only used in the progress message
# (presumably the file name - confirm against ConvertToPlug). All arguments
# are forwarded unchanged to ConvertToPlug::process_type, whose return value
# is returned.
#
# When the use_sections option is set and the document was converted to
# HTML, the text is split at the "<a name=N></a>" anchors that mark the
# start of each page and every page is wrapped in Greenstone
# "<!-- <Section> ... -->" markup. Text without any such anchor is left
# untouched.
sub process {
    my $self = shift (@_);

    if ($self->{'use_sections'}
	&& $self->{'converted_to'} eq "HTML") {

	print STDERR "PDFPlug: Calculating sections...\n";
	my $textref = $_[0];

	# we have "<a name=1></a>" etc for each page
	my @sections = split('<a name=', $$textref);

	shift @sections; # don't need HTML header, etc

	if (@sections) {
	    # derive the document title from the text of the first page

	    my $title = $sections[0];
	    $title =~ s/^\d+>//; # specific for pdftohtml...
	    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	    $title =~ s/<[^>]*>/ /g;
	    # nbsp, either as an HTML entity or as its utf-8 byte sequence
	    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;
	    $title =~ s/^\s+//s;
	    $title =~ s/\s+$//;
	    $title =~ s/\s+/ /gs;
	    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	    $title =~ s/^\s+//s; # in case title_sub introduced any...

	    # only shorten (and mark with "...") titles that are too long;
	    # a short title must keep its last word
	    if (length ($title) > 100) {
		$title = substr ($title, 0, 100);
		$title =~ s/\s\S*$/.../;
	    }

	    my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

	    # add metadata per section...
	    foreach my $section (@sections) {
		# Greenstone does magic if sections are titled digits.
		# Only trust $1 when the substitution matched - after a
		# failed match it still holds the capture of an earlier
		# successful one.
		my $page_title;
		if ($section =~ s@^(\d+)></a>@@) { # leftover from split expression...
		    $page_title = $1;
		}
		else {
		    print STDERR "no title: $section\n";
		    $page_title = "";
		}

		# $section aliases the array element, so this rewrites
		# @sections in place
		$section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
		    . "<Metadata name=\"Title\">" . $page_title
		    . "</Metadata>\n--><p>\n"
		    . $section
		    . "<!--</Section>-->\n";
	    }

	    $$textref = join('', ($top_section, @sections));
	}
	else {
	    # no page anchors: keep the text as it is rather than replacing
	    # the whole document with an empty section
	    print STDERR "PDFPlug: no page anchors found, not creating sections\n";
	}
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
	if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self,"pdf",@_);
}
|
---|
197 |
|
---|
198 | 1;
|
---|