source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 16.6 KB
Line 
1###########################################################################
2#
3# PDFPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package PDFPlugin;
26
27use strict;
28no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29
30use ReadTextFile;
31use unicode;
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
36@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
37
38
# Allowed values for the -convert_to option.  Every entry's description is a
# resource key of the form {ConvertBinaryFile.convert_to.<name>}, so the
# list is generated from the option names rather than spelled out by hand.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
54
# Argument specifications registered by this plugin (added to the shared
# "ArgList" in new()).  The 'desc' values are resource keys, not literal text.
my $arguments = [
    # --- conversion target and file selection ---
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "html",
    },
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => &get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BaseImporter.block_exp}",
        'type' => "regexp",
        'deft' => &get_default_block_exp(),
    },

    # --- metadata extraction (option descriptions shared with HTMLPlugin) ---
    {
        'name' => "metadata_fields",
        'desc' => "{HTMLPlugin.metadata_fields}",
        'type' => "string",
        'deft' => "Title,Author,Subject,Keywords",
    },
    {
        'name' => "metadata_field_separator",
        'desc' => "{HTMLPlugin.metadata_field_separator}",
        'type' => "string",
        'deft' => "",
    },

    # --- switches turned into -pdf_* conversion options by new() ---
    {
        'name' => "noimages",
        'desc' => "{PDFPlugin.noimages}",
        'type' => "flag",
    },
    {
        'name' => "allowimagesonly",
        'desc' => "{PDFPlugin.allowimagesonly}",
        'type' => "flag",
    },
    {
        'name' => "complex",
        'desc' => "{PDFPlugin.complex}",
        'type' => "flag",
    },
    {
        'name' => "nohidden",
        'desc' => "{PDFPlugin.nohidden}",
        'type' => "flag",
    },
    {
        'name'  => "zoom",
        'desc'  => "{PDFPlugin.zoom}",
        'deft'  => "2",
        'range' => "1,3",    # actually the range is 0.5-3
        'type'  => "int",
    },

    # --- document structure ---
    {
        'name' => "use_sections",
        'desc' => "{PDFPlugin.use_sections}",
        'type' => "flag",
    },
    {
        'name' => "description_tags",
        'desc' => "{HTMLPlugin.description_tags}",
        'type' => "flag",
    },
    {
        'name' => "use_realistic_book",
        'desc' => "{PDFPlugin.use_realistic_book}",
        'type' => "flag",
    },
];
107
# Plugin description record (added to the shared "OptList" in new()).
my $options = {
    'name'           => "PDFPlugin",
    'desc'           => "{PDFPlugin.desc}",
    'args'           => $arguments,
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => "yes",
};
114
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of command-line style arguments; a default
#                      "-title_sub" pair is appended to it
#   $hashArgOptLists - hash ref holding the shared "ArgList" / "OptList"
#
# Builds the object by merging an AutoLoadConverters instance with a
# ConvertBinaryFile instance, settles on the final convert_to target,
# assembles the options handed to the secondary plugin and loads it.
# Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page.  Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    # It is used twice below: once for this plugin, once for the secondary one.
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit Class->new(...) calls: the indirect-object form ("new Class(...)")
    # is ambiguous inside a package that defines its own sub new.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to (GSDLOS may be unset in the environment)
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    if ($self->{'convert_to'} eq "text" && $gsdlos =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            # use_sections implies description_tags
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self was already blessed into $class above; no second bless is needed.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
215
# Default process_exp: a case-insensitive match on a ".pdf" file extension.
# The invocant, if any, is not used.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
221
# Default block_exp: the empty string, so that HTMLPlug's block expression
# is not inherited.
sub get_default_block_exp {
    my $no_block_exp = "";
    return $no_block_exp;
}
226
# Initialise both parent behaviours: first the inherited (SUPER) init, then
# the AutoLoadConverters one.  All remaining arguments are forwarded
# unchanged; the result of the AutoLoadConverters call is returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);    # ConvertBinaryFile init
    return $self->AutoLoadConverters::init(@_);
}
235
# Run begin() for AutoLoadConverters first, then the inherited (SUPER)
# begin().  All remaining arguments are forwarded unchanged; the result of
# the SUPER call is returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
243
# Run deinit() for AutoLoadConverters first, then the inherited (SUPER)
# deinit().  All remaining arguments are forwarded unchanged; the result of
# the SUPER call is returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
251
# Hashing on the ga xml ensures that two PDF files that are identical
# except for their metadata hash to different values.  Without this, when
# each PDF file is converted to HTML there is a chance that the results
# will be *identical*, if the conversion utility does not embed the
# metadata in the generated HTML.  This is certainly the case when PDFBox
# is being used.
#
# This makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should
# be seen as different.
sub get_oid_hash_type {
    my ($self) = @_;
    my $hash_type = "hash_on_ga_xml";
    return $hash_type;
}
269
270
# Hand the conversion over to the AutoLoadConverters implementation,
# forwarding all arguments and returning whatever it returns.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
277
# Post-process a converted file in place.
#
# Argument: $conv_filename - path of the converted file; it is read through
# $self->read_file (as utf8) and written back through $self->utf8_write_file.
#
# Steps:
#   1. Prefix every PDFBox page <div> with "<a name=0></a>" so that PDFBox
#      output carries page anchors like pdftohtml output does.
#   2. Replace surrogate code points (U+D800-U+DFFF) with a space.
#   3. Count the page anchors and store the count in $self->{'num_pages'}.
#   4. If use_sections is set and the file was converted to HTML, split the
#      text on the page anchors and wrap each page in <Section> comment markup.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs
    # into pages for sections.  The code below wants each new page to be
    # prefixed with <a name=pagenum></a>, which it then splits on to generate
    # page-based sections.  PDFBox does not generate that, but it does embed
    # each page in an extra div, so an anchor is prefixed to each such div.
    # The anchor name is fixed at 0 here; the real page number is assigned
    # further down by $loopcounter while the sections are processed.
    my $loopcounter = 0; # used later on!
    my $pdfbox_page_div = '<div style="page-break-before:always; page-break-after:always">';
    $text =~ s{\Q$pdfbox_page_div\E}{<a name=0></a>$pdfbox_page_div}g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as it might
    # be useful to give an indication of document length in browser through setting
    # num_pages as metadata.
    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        if (scalar (@sections) <= 1) {
            # Nothing to split on (or empty text): leave $text untouched.
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            # Everything before the first anchor (HTML header etc) is kept
            # as the top section.  Deciding "no sections" before this shift
            # matters: a document with exactly one anchor has one element
            # left afterwards and must still be treated as sectioned.
            my $top_section = shift @sections;

            # Top-level title: the text of the first page, tags stripped.
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            # Only shorten (and mark with "...") titles that are too long;
            # a short title keeps its last word.
            if (length($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like "outline".
                # Only trust $1 when the substitution matched, otherwise it
                # would hold a capture left over from an earlier match.
                my $section_title;
                if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                    $section_title = $1; # Greenstone does magic if sections are titled digits
                }

                # A title of pagenum=0 means use_sections is being applied on output from PDFBox,
                # which got the <a name=0></a> prefix above.  Number those pages here.
                if ($loopcounter > 0 || (defined $section_title && $section_title eq "0")) {
                    $section_title = ++$loopcounter;
                }

                if (! defined($section_title) ) {
                    print STDERR "no title: $section\n";
                    $section_title = " ";
                }
                my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
                $newsection .= "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><br />\n";
                $newsection .= $section;
                $newsection .= "<!--</Section>-->\n";
                $section = $newsection; # $section aliases the array element
            }

            $text=join('', ($top_section, @sections));
        }
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }


    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
##    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
407
408
# Do plugin specific processing of doc_obj for HTML type.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); only
# $base_dir, $file and $doc_obj are used here.
#
# Runs $self->process_type, then on the top section of $doc_obj:
#   - replaces every "date" metadata value with a "Date" value in
#     YYYYMMDD form (values that cannot be parsed are just removed),
#   - adds "NumPages" when $self->{'num_pages'} is defined,
#   - sets "gsdlthistype" when use_sections was applied to HTML output.
# Returns the result of process_type.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the list: entries are deleted from the document
    # while we iterate.
    my @date_values = @{ $doc_obj->get_metadata($cursection, "date") || [] };
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Only read the captures when the match succeeded, so that values
        # left over from an earlier match are never used.
        next unless ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        my ($year, $month, $day) = ($1,$2,$3);

        next if ($year == 0);
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
453
4541;
Note: See TracBrowser for help on using the repository browser.