source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 24349

Last change on this file since 24349 was 24290, checked in by sjm84, 13 years ago

Several changes to how Greenstone hashes PDF files and also added several more options to the EmbeddedMetadataPlugin

  • Property svn:keywords set to Author Date Id Revision
File size: 15.8 KB
RevLine 
[1410]1###########################################################################
2#
[15872]3# PDFPlugin.pm -- reasonably with-it pdf plugin
[1410]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[2661]8# Copyright (C) 1999-2001 New Zealand Digital Library Project
[1410]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[15872]25package PDFPlugin;
[1410]26
[10353]27use strict;
[22702]28no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
[1410]29
[22705]30use ReadTextFile;
31use unicode;
[22702]32
[22861]33use AutoLoadConverters;
[22864]34use ConvertBinaryFile;
[1410]35
[22861]36@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
[22705]37
38
# The values accepted by the -convert_to option.  Each entry pairs the
# option value with the resource-bundle key holding its description
# ("{ConvertBinaryFile.convert_to.<name>}").
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
54
# Plugin-specific arguments.  new() appends this list to the "ArgList"
# entry of the argument/option hash it is given.
my $arguments = [
    {   name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'html',
    },
    {   name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasePlugin.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {   name => 'metadata_fields',
        desc => '{HTMLPlugin.metadata_fields}',
        type => 'string',
        deft => '',
    },
    {   name => 'metadata_field_separator',
        desc => '{HTMLPlugin.metadata_field_separator}',
        type => 'string',
        deft => '',
    },
    {   name => 'noimages',
        desc => '{PDFPlugin.noimages}',
        type => 'flag',
    },
    {   name => 'allowimagesonly',
        desc => '{PDFPlugin.allowimagesonly}',
        type => 'flag',
    },
    {   name => 'complex',
        desc => '{PDFPlugin.complex}',
        type => 'flag',
    },
    {   name => 'nohidden',
        desc => '{PDFPlugin.nohidden}',
        type => 'flag',
    },
    {   name  => 'zoom',
        desc  => '{PDFPlugin.zoom}',
        deft  => '2',
        range => '1,3',    # actually the range is 0.5-3
        type  => 'int',
    },
    {   name => 'use_sections',
        desc => '{PDFPlugin.use_sections}',
        type => 'flag',
    },
    {   name => 'description_tags',
        desc => '{HTMLPlugin.description_tags}',
        type => 'flag',
    },
];
[3540]104
# Plugin description record.  new() appends it to the "OptList" entry of
# the argument/option hash it is given.
my $options = {
    name           => 'PDFPlugin',
    desc           => '{PDFPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
111
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      pattern is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" array refs
#                      receive this plugin's arguments and options
#
# Builds the object by merging an AutoLoadConverters instance (configured
# for "PDFBoxConverter") with a ConvertBinaryFile instance, translates the
# PDF-specific flags into 'convert_options', resolves the effective
# 'convert_to' value, assembles the option list for the secondary plugin
# and finally loads the secondary plugins.
#
# Returns the object blessed into $class.  When 'info_only' is set the
# object is returned immediately after blessing, with no further set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit Class->new(...) calls; the indirect-object form
    # "new Class(...)" is ambiguous to the parser and discouraged.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist,$inputargs,$hashArgOptLists,["PDFBoxConverter"],1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    # Bless exactly once; every return path below hands back this object.
    $self = bless $self, $class;

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return $self;
    }

    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # (GSDLOS may be absent from the environment, so test definedness first)
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # The secondary plugin gets the same title_sub as this plugin.
    push(@$specific_options , "-title_sub", $title_sub);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        # push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            # use_sections implies description_tags
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
203
# Default for the process_exp argument: a regular expression (as a string)
# that matches file names ending in ".pdf", case-insensitively.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
[2661]209
# Default for the block_exp argument.  Deliberately empty, so that
# HTMLPlug's block expression is not inherited.
sub get_default_block_exp {
    return '';
}
[22861]214
# Initialisation hook.  Runs the superclass init (ConvertBinaryFile init)
# first, then AutoLoadConverters' init, forwarding all arguments to both.
# The value of the AutoLoadConverters call is returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    return $self->AutoLoadConverters::init(@_);
}
223
# Begin hook.  Runs AutoLoadConverters' begin first, then the superclass
# begin, forwarding all arguments to both.  The value of the superclass
# call is returned.
sub begin {
    my $self = shift;

    $self->AutoLoadConverters::begin(@_);
    return $self->SUPER::begin(@_);
}
231
# De-initialisation hook.  Runs AutoLoadConverters' deinit first, then the
# superclass deinit, forwarding all arguments to both.  The value of the
# superclass call is returned.
sub deinit {
    my $self = shift;

    $self->AutoLoadConverters::deinit(@_);
    return $self->SUPER::deinit(@_);
}
239
# Hashing on the GA XML ensures that two PDF files which are identical
# except for their metadata hash to different values.  Without this, when
# each PDF file is converted to HTML there is a chance that the results
# will be *identical*, if the conversion utility does not embed the
# metadata in the generated HTML.  This is certainly the case when PDFBox
# is being used.
#
# This makes this convert-to based plugin more consistent with the
# original vision that the same document with different metadata should
# be seen as different.
sub get_oid_hash_type {
    return 'hash_on_ga_xml';
}
257
258
# Convert a file in the temporary area by delegating explicitly to
# AutoLoadConverters' implementation, forwarding all arguments and
# returning its result.
# NOTE(review): presumably named explicitly so that a version found earlier
# in @ISA (ConvertBinaryFile is listed first) is not used - confirm.
sub tmp_area_convert_file {
    my $self = shift;

    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
}
265
# Post-process, in place, the file produced by the conversion step.
#
# Argument:
#   $conv_filename - name of the converted file; it is read with
#                    $self->read_file and rewritten with
#                    $self->utf8_write_file
#
# Effects:
#   * every PDFBox page-break <div> is prefixed with "<a name=0></a>" so
#     that PDFBox output can be split into pages like pdftohtml output;
#   * $self->{'num_pages'} is set to the number of page markers found;
#   * when 'use_sections' is set and the conversion produced HTML, the text
#     is split at each "<a name=" marker and each piece is wrapped in
#     Greenstone <Section> description-tag comments.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # To support the use_sections option with PDFBox: Greenstone splits PDFs
    # into pages for sections.  This code wants each new page to be prefixed
    # with <a name=pagenum></a>, which it then splits on to generate
    # page-based sections.  PDFBox does not generate that, but it does embed
    # each page in an extra div, which we prefix with <a name=0></a>.  The
    # page number is fixed at 0 here; when the sections are processed below,
    # $loopcounter supplies the real, incrementing page number.
    my $pdfbox_page_div = '<div style="page-break-before:always; page-break-after:always">';
    my $loopcounter = 0; # used later on!
    $text =~ s{\Q$pdfbox_page_div\E}{<a name=$loopcounter></a>$pdfbox_page_div}g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page).  Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    #
    # An anchor immediately followed by the page div it was prefixed to above
    # is matched as ONE page; counting the anchor and the div separately would
    # report twice the real page count for PDFBox output.  A page div without
    # an anchor (e.g. differently cased, so not prefixed above) still counts.
    my @pages = ($text =~ m{<a name="?\w+"?>(?:</a>\Q$pdfbox_page_div\E)?|\Q$pdfbox_page_div\E}ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    my $converted_to = defined $self->{'converted_to'} ? $self->{'converted_to'} : "";

    if ($self->{'use_sections'}
        && $converted_to eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        if (scalar (@sections) <= 1) { # only one piece (or empty text) - no split!
            # Nothing to section: $text is left exactly as it was read.
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            # keep HTML header etc as top_section
            my $top_section = shift @sections;

            # handle first section specially for title? Or all use first 100...
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;

            # Decided before any page is consumed, so a document with exactly
            # one page anchor is still sectioned rather than being glued back
            # on without the "<a name=" that split() removed.
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like "outline"
                # Strip what split() left of the anchor; only trust the capture
                # when the substitution actually matched.
                my $section_title;
                if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                    $section_title = $1; # Greenstone does magic if sections are titled digits
                }

                # A title of pagenum=0 means use_sections is being applied on
                # output from PDFBox, which got <a name=0></a> prefixed above.
                # Now need to increment the pagenum here:
                if ($loopcounter > 0
                    || (defined $section_title && $section_title eq "0")) { # implies use_sections with PDFBox
                    $section_title = ++$loopcounter;
                }

                if (! defined($section_title) ) {
                    print STDERR "no title: $section\n";
                    $section_title = " "; # get rid of the undefined warning in next line
                }
                my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
                $newsection .= "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><br />\n";
                $newsection .= $section;
                $newsection .= "<!--</Section>-->\n";
                $section = $newsection; # $section aliases the array element
            }

            $text=join('', ($top_section, @sections));
        }
    }

    if ($self->{'use_sections'}
        && $converted_to eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
##    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
[7287]393
394
# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
# Only $base_dir, $file and $doc_obj are used here.
#
# Runs $self->process_type on the document, then on the top section:
#   * replaces every "date" metadata value with a "Date" value in
#     yyyymmdd form (values that do not look like y-m-d, or whose year is
#     0, are simply dropped);
#   * records $self->{'num_pages'} as "NumPages";
#   * marks the document as "Paged" when use_sections was applied to HTML.
#
# Returns the result of process_type.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the list: entries are deleted from the document
    # inside the loop, and this keeps the iteration independent of whether
    # get_metadata hands back the document's own array.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Take the captures from the match itself: $1..$3 keep their old
        # values when a match fails, so they must not be read unchecked.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless (defined($year) && defined($month) && defined($day));

        if ($year == 0) {next}
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
434
4351;
Note: See TracBrowser for help on using the repository browser.