source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 22702

Last change on this file since 22702 was 22702, checked in by davidb, 14 years ago

Introduction of new plugin AutoloadConverterScripting to replace duplicated code that turns up in plugins like OOConvertBinaryFile and PBConvertBinaryFile

  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
Line 
1###########################################################################
2#
3# PDFPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package PDFPlugin;
26
27use ReadTextFile;
28use unicode;
29use strict;
30no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
31
# Compile-time setup: record which converter extension the autoloader should
# use, load the autoloader module, and establish this package's inheritance.
BEGIN {
    $main::autoloadConverterScripting = 'PDFBoxConverter';
    require AutoloadConverterScripting;

    @PDFPlugin::ISA = qw(AutoloadConverterScripting ReadTextFile);
}
38
# Allowed values for the -convert_to option. Every entry pairs an option
# name with the resource-bundle key that holds its description.
my $convert_to_list = [
    map {
        +{
            'name' => $_,
            'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}',
        }
    } qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
54
# Option descriptors specific to this plugin. Each hash describes one
# command-line option: its name, the resource key of its description, its
# type, and (where given) its default, range, or list of permitted values.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'metadata_field_separator',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
104
# Plugin descriptor that new() appends to the caller's "OptList"; it carries
# the plugin name, its description key and the option list defined above.
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    'srcreplaceable' => 'yes',    # Source docs in PDF can be replaced with GS-generated html
    'args'           => $arguments,
};
111
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      pattern is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's option descriptors
#
# Returns the new object blessed into $class. When the base constructor
# reports 'info_only', the object is returned without any further set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    # Kept in one variable because it is given both to this plugin and to
    # the secondary plugin further down.
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit Class->method call: the indirect-object form ("new Foo(...)")
    # is ambiguous to the parser, particularly inside a package that
    # defines its own sub named new.
    my $self = AutoloadConverterScripting->new("PDFBoxConverter", $pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to (GSDLOS may be unset, so test definedness first)
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for the chosen secondary plugin,
    # then extend that list in place.
    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags, so switch it on here too
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
201
# Default -process_exp value: a case-insensitive pattern matching names
# that end in ".pdf".
sub get_default_process_exp {
    shift(@_);    # the first argument (invocant, if any) is not used

    return '(?i)\.pdf$';
}
207
# Default -block_exp value is the empty string, so that we don't inherit
# HTMLPlug's block exp.
sub get_default_block_exp {
    return q{};
}
212
# Post-process a converted file in place.
#
# Reads $conv_filename as utf8, counts the per-page "<a name=...>" anchors
# and stores the count in $self->{'num_pages'}. When 'use_sections' is set
# and 'converted_to' is "HTML", the text is additionally split at those
# anchors and each page is wrapped in Greenstone <Section> description
# comments. The result is passed through unicode::ensure_utf8 and written
# back to $conv_filename.
#
# Arguments:
#   $conv_filename - path of the converted file to rewrite
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        # Decide once whether the split found any page anchors. Counting
        # @sections again after the header has been shifted off would
        # mistake a one-page document for "no sections" and emit its page
        # with the "<a name=" prefix missing.
        my $found_sections = (scalar(@sections) > 1);

        my $top_section = "";

        if ($found_sections) {
            $top_section .= shift @sections; # keep HTML header etc as top_section

            # handle first section specially for title? Or all use first 100...
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...

            # Only shorten (and mark with "...") titles that are actually
            # too long; a short title is kept whole.
            if (length($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        } else {
            # only one piece - no split! Keep the text exactly as it was.
            print $outhandle "PDFPlugin: warning - no sections found\n";
            $top_section .= $sections[0] if (scalar(@sections));
            @sections = ();
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                # Read the capture only after a successful match; otherwise
                # fall back to a blank title.
                print STDERR "no title: $section\n";
                $section_title = " ";
            }
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # foreach aliases the array element
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
305
306
# Do plugin specific processing of doc_obj for HTML type.
#
# Delegates to process_type(), then tidies the top section's metadata:
# every lower-case "date" value is removed and, where it contains a
# year-month-day triple, re-added as "Date" in YYYYMMDD form. "NumPages" is
# added from $self->{'num_pages'}, and the document is marked as "Paged"
# when use_sections is on and the conversion target was HTML.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
# only $base_dir, $file and $doc_obj are used here.
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Only read the captures when the match succeeded; after a failed
        # match $1..$3 do not describe this value.
        next unless (defined $datemeta && $datemeta =~ /(\d+)-(\d+)-(\d+)/);
        my ($year, $month, $day) = ($1, $2, $3);

        next if ($year == 0);
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
346
3471;
Note: See TracBrowser for help on using the repository browser.