root/main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm @ 24414

Revision 24414, 16.4 KB (checked in by ak19, 8 years ago)

Changes related to EmbeddedMetadataPlugin: 1. The mkcol.pl and GLI changes put the plugin among the bottom four plugins of the plugin pipeline. 2. EmbeddedMetadataPlugin and PDFPlugin are modified to work together again after the recent changes (the introduction of the overridable BasePlugin method can_process_this_file_for_metadata), which were needed to get EmbeddedMetadataPlugin and OAIPlugin to work together.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# PDFPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package PDFPlugin;
26
27use strict;
28no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
29
30use ReadTextFile;
31use unicode;
32
33use AutoLoadConverters;
34use ConvertBinaryFile;
35
36@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
37
38
# Values accepted by the -convert_to argument. Each entry pairs the option
# name with a brace-wrapped description placeholder of the form
# {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
54
# Argument descriptors for this plugin. new() appends them to the shared
# "ArgList". Each entry is a hash with 'name', 'desc' and 'type', plus
# optional 'deft' (default), 'reqd', 'list' (enum values) and 'range'.
my $arguments = [
    {   'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {   'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {   'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {   'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {   'name' => 'metadata_field_separator',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
        'type' => 'string',
        'deft' => '',
    },
    {   'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {   'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {   'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {   'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {   'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {   'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {   'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
104
# Plugin descriptor; new() appends it to the shared "OptList".
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
111
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; "-title_sub <regex>"
#                      is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptors
#
# Builds the object by merging an AutoLoadConverters instance with a
# ConvertBinaryFile instance, derives the converter options from the plugin
# arguments, assembles the option list for the secondary plugin and loads it.
#
# Returns the blessed plugin object. When 'info_only' is set the object is
# returned straight after construction, without any option processing.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    # It is handed both to this plugin and to the secondary plugin below.
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to (GSDLOS may be unset, so default it to the empty string)
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    if ($self->{'convert_to'} eq "text" && $gsdlos =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        # use_sections implies description_tags
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self was already blessed into $class above, so no second bless is needed.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
203
# Default value for -process_exp: a case-insensitive match on file names
# ending in ".pdf". Takes no meaningful arguments.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
209
# Default value for -block_exp: the empty string, so that we don't
# inherit HTMLPlug's block expression.
sub get_default_block_exp {
    return q{};
}
214
# Always answers with a false value (0), whatever arguments are passed.
#
# The intent recorded with this method is that a false answer lets
# processing of the file continue down the pipeline, so that other plugins
# also get the opportunity to locate metadata and set it up in the
# extrametakeys variables that are passed around.
#
# NOTE(review): earlier comments here spoke of returning 'undef', but the
# code returns 0. Both are false; confirm that no caller distinguishes
# between them with defined().
sub can_process_this_file_for_metadata {
    my ($self) = @_;

    return 0;
}
229
# Run both parents' init() with the caller's arguments: first the one
# reached via SUPER:: (ConvertBinaryFile is first in @ISA), then
# AutoLoadConverters'. The value of the second call is returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
238
# Run both parents' begin() with the caller's arguments: AutoLoadConverters'
# first, then the one reached via SUPER::. The value of the second call is
# returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
246
# Run both parents' deinit() with the caller's arguments: AutoLoadConverters'
# first, then the one reached via SUPER::. The value of the second call is
# returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
254 
# Hash on the GA XML rather than on the converted file.
#
# Setting the hashing to "hash_on_ga_xml" ensures that two PDF files that
# are identical except for their metadata hash to different values.
# Without this, when each PDF file is converted to HTML there is a chance
# that both results will be *identical*, if the conversion utility does not
# embed the metadata in the generated HTML. This is certainly the case when
# PDFBox is being used.
#
# This makes this convert-to based plugin more consistent with the original
# vision that the same document with different metadata should be seen as
# different.
sub get_oid_hash_type {
    my ($self) = @_;

    return 'hash_on_ga_xml';
}
272 
273 
# Delegate explicitly to AutoLoadConverters' tmp_area_convert_file(),
# forwarding the caller's arguments and returning its result unchanged.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
280
# Derive a short plain-text title from the raw text of the first page
# section (the text that followed the first '<a name=' split point).
#
#   $title     - raw section text, starting with the leftover 'N>' of the anchor
#   $title_sub - optional regex (string) stripped from the start of the title
#
# Returns the cleaned title. Only titles longer than 100 characters are cut
# back, to the last whole word within the first 100 characters plus "...".
sub _title_from_first_section {
    my ($title, $title_sub) = @_;

    $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
    $title =~ s/<[^>]*>/ /g;
    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
    $title =~ s/^\s+//s;
    $title =~ s/\s+$//;
    $title =~ s/\s+/ /gs;
    $title =~ s/^$title_sub// if ($title_sub);
    $title =~ s/^\s+//s; # in case title_sub introduced any...

    # Only shorten (and mark with an ellipsis) when something is cut off;
    # a short title must keep its last word.
    if (length($title) > 100) {
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;
    }
    return $title;
}

# Post-process the file produced by the PDF conversion, in place.
#
#   $conv_filename - path of the converted file
#
# Steps:
#   1. read the file as UTF-8;
#   2. prefix every PDFBox page <div> with an '<a name=0></a>' page anchor;
#   3. count the page anchors and store the count in $self->{'num_pages'};
#   4. if use_sections is set and the output is HTML, wrap each page in
#      Greenstone <Section> comment markup with a Title per page;
#   5. write the text back to $conv_filename.
#
# Returns whatever utf8_write_file() returns.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types? will we
    # do this step for them anyway?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs
    # into pages for sections. This code wants each new page to be prefixed
    # with <a name=pagenum></a>, which it then splits on to generate
    # page-based sections. PDFBox does not generate that, but it embeds each
    # page in an extra div, which we prefix with <a name=0></a>. The page
    # number is fixed at 0 here; when the sections are processed below,
    # $loopcounter supplies the real, incrementing page number.
    my $pdfbox_page_div = '<div style="page-break-before:always; page-break-after:always">';
    my $loopcounter = 0; # used later on!
    $text =~ s@\Q$pdfbox_page_div\E@<a name=$loopcounter></a>$pdfbox_page_div@g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page). Metadata based on this calculation is not set
    # until process().
    #
    # Only the anchors are counted: every PDFBox page div has just been given
    # an anchor, so counting the divs as well would count each page twice.
    #
    # Note: this is done even if we are not breaking the document into pages,
    # as it might be useful to give an indication of document length in the
    # browser through setting num_pages as metadata.
    my @pages = ($text =~ m/\<a name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        if (scalar (@sections) <= 1) {
            # Nothing to split on (or empty text): leave $text untouched.
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            # keep HTML header etc as top_section
            my $top_section = shift @sections;

            # The top-level title is taken from the start of the first page.
            my $title = _title_from_first_section($sections[0], $self->{'title_sub'});
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                my $section_title;

                # section names are not always just digits, may be like "outline".
                # Only trust $1 when the substitution actually matched.
                if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                    $section_title = $1; # Greenstone does magic if sections are titled digits
                }

                # A title of pagenum=0 means use_sections is being applied on
                # output from PDFBox, whose pages were all given <a name=0></a>
                # above. Number those pages 1, 2, 3, ... here.
                if ($loopcounter > 0
                    || (defined($section_title) && $section_title eq "0")) {
                    $section_title = ++$loopcounter;
                }

                if (! defined($section_title) ) {
                    print STDERR "no title: $section\n";
                    $section_title = " "; # avoid an undefined title below
                }

                $section = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                    . "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><br />\n"
                    . $section
                    . "<!--</Section>-->\n";
            }

            $text=join('', ($top_section, @sections));
        }
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # Calling unicode::ensure_utf8(\$text) should no longer be needed here,
    # now that strings read in are Unicode aware (in the Perl sense) rather
    # than raw binary strings that just happen to be UTF-8 compliant.

    # Write it out again!
    return $self->utf8_write_file (\$text, $conv_filename);
}
408
409
# Plugin-specific processing of a document object.
#
# Parameters: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
# only $base_dir, $file and $doc_obj are used here.
#
# Calls process_type(), then on the top section of $doc_obj:
#   - replaces each "date" value with a "Date" value in yyyymmdd form
#     (values that contain no d-d-d pattern, or whose year is 0, are dropped);
#   - adds "NumPages" from $self->{'num_pages'};
#   - marks the document as "Paged" when use_sections is on and the
#     conversion produced HTML.
#
# Returns the result of process_type().
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Match in list context so that a failed match leaves all three
        # undefined instead of picking up captures from an earlier match.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless (defined($year) && defined($month) && defined($day));

        if ($year == 0) {next}
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
449
4501;
Note: See TracBrowser for help on using the browser.