source: gsdl/trunk/perllib/plugins/PDFPlugin.pm@ 18339

Last change on this file since 18339 was 18145, checked in by ak19, 16 years ago

When PDFPlugin now converts PDF pages to images it no longer uses the default screenviewsize of 500 but sets it to 1000. This didn't work previously because the convert_to pattern-match on pagedimg failed. It turned out that the pattern should have been looking for PagedImage instead.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
Rev Line
[1410]1###########################################################################
2#
[15872]3# PDFPlugin.pm -- reasonably with-it pdf plugin
[1410]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
[2661]8# Copyright (C) 1999-2001 New Zealand Digital Library Project
[1410]9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[15872]25package PDFPlugin;
[1410]26
[15872]27use ConvertBinaryFile;
28use ReadTextFile;
[8218]29use unicode;
[10353]30use strict;
31no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
[1410]32
# Establish the inheritance chain at compile time: PDFPlugin derives from
# ConvertBinaryFile first, then ReadTextFile.
BEGIN {
    @PDFPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
[1410]36
# Values accepted by the -convert_to option (used as the 'list' of the
# "convert_to" entry in $arguments). Each 'desc' is a brace-wrapped
# identifier; presumably it is resolved to a display string elsewhere.
my $convert_to_list = [
    {
        'name' => 'auto',
        'desc' => '{ConvertBinaryFile.convert_to.auto}',
    },
    {
        'name' => 'html',
        'desc' => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertBinaryFile.convert_to.text}',
    },
    {
        'name' => 'pagedimg_jpg',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}',
    },
    {
        'name' => 'pagedimg_gif',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}',
    },
    {
        'name' => 'pagedimg_png',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}',
    },
];
51
52
# Argument descriptions for this plugin. new() appends this list to the
# "ArgList" entry of the hash it is given.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
[3540]98
# Plugin-level option record. new() appends it to the "OptList" entry of
# the hash it is given.
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
105
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it.
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      pair is appended before the parent constructor runs.
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's $arguments and $options.
#
# Returns the object built by ConvertBinaryFile's constructor, re-blessed
# into $class. When the object has 'info_only' set, it is returned straight
# away without any conversion or secondary-plugin configuration.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better
    # way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call; equivalent to the indirect-object form
    # "new ConvertBinaryFile(...)" but not subject to its parsing ambiguity.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    $secondary_plugin_options->{'HTMLPlugin'} = []
        unless defined $secondary_plugin_options->{'HTMLPlugin'};
    $secondary_plugin_options->{'TextPlugin'} = []
        unless defined $secondary_plugin_options->{'TextPlugin'};

    # PagedImagePlugin defaults are only set up when converting to paged
    # images and no option list for it exists yet.
    if (defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ m/(?:pagedimage|pagedimg)/i
        && !defined $secondary_plugin_options->{'PagedImagePlugin'}) {
        $secondary_plugin_options->{'PagedImagePlugin'} = [
            "-title_sub",      $title_sub,
            "-screenviewsize", "1000",
        ];
    }

    my $html_options     = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options     = $secondary_plugin_options->{'TextPlugin'};
    # May be undef when no PagedImagePlugin option list exists.
    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};

#    if ($self->{'input_encoding'} eq "auto") {
#        $self->{'input_encoding'} = "utf8";
#    }

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options, "-metadata_fields", $self->{'metadata_fields'});
    } else {
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    # use_sections implies description_tags
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        # Only push when a PagedImagePlugin option list actually exists;
        # pushing onto an undefined value would just fill a throw-away array.
        push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re)
            if defined $pagedimg_options;
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
196
# Default process expression: a pattern string matching names that end in
# ".pdf", case-insensitively. Takes no meaningful arguments.
sub get_default_process_exp {
    return '(?i)\.pdf$';
}
[2661]202
# Default block expression: the empty string, so that HTMLPlug's block
# expression is not inherited.
sub get_default_block_exp {
    return q{};
}
[1410]207
# Post-process the converted file in place.
#
# Arguments:
#   $conv_filename - path of the converted file; it is read with read_file()
#                    (as utf8) and rewritten with utf8_write_file().
#
# Effects:
#   * $self->{'num_pages'} is set to the number of <a name=...> anchors found
#     in the text (pdftohtml emits one such anchor per page).
#   * When 'use_sections' is set and 'converted_to' is "HTML", the text is
#     split at those anchors and each piece is wrapped in <Section> comment
#     markup with a Title metadata element.
#   * unicode::ensure_utf8() is applied to the text before writing it back.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types?
    my $text = "";
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page). Metadata based on this calculation is not set
    # until process().
    #
    # Note: this is done even if we are not breaking the document into
    # pages, as it might be useful to give an indication of document length
    # in the browser through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split(/<[Aa] name=/, $text);

        if (scalar (@sections) <= 1) {
            # Nothing to split on (or empty text): leave $text untouched.
            # This is decided BEFORE the header is removed from @sections,
            # so a document with exactly one anchor is still sectioned
            # instead of being written back with its "<a name=" eaten.
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            # keep HTML header etc as top_section
            my $top_section = shift @sections;

            # Derive the document title from the first section's text.
            my $title = $sections[0];
            $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s; # in case title_sub introduced any...
            # Only shorten (and mark with "...") titles that are too long;
            # a short title keeps its last word.
            if (length($title) > 100) {
                $title = substr ($title, 0, 100);
                $title =~ s/\s\S*$/.../;
            }

            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # section names are not always just digits, may be like
                # "outline". Greenstone does magic if sections are titled
                # digits.
                my $section_title;
                if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                    # $1 is only trustworthy because the match succeeded
                    $section_title = $1;
                } else {
                    print STDERR "no title: $section\n";
                    $section_title = " ";
                }

                $section = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                    . "<Metadata name=\"Title\">" . $section_title
                    . "</Metadata>\n--><p>\n"
                    . $section
                    . "<!--</Section>-->\n";
            }

            $text = join('', ($top_section, @sections));
        }
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
[7287]300
301
# Do plugin-specific processing of doc_obj for the HTML type.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
# Only $base_dir, $file and $doc_obj are used here.
#
# Effects on $doc_obj's top section:
#   * every "date" metadata value is deleted; values containing a
#     d-d-d pattern with a non-zero year are re-added as "Date" in
#     YYYYMMDD form;
#   * "NumPages" is added from $self->{'num_pages'};
#   * "gsdlthistype" is set to "Paged" when sectioning HTML output.
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a snapshot of the values: entries are deleted from the
    # document inside the loop, and get_metadata() may hand back the
    # document's own list (not visible from here).
    my $date_values = $doc_obj->get_metadata($cursection, "date");
    my @dates = @{ $date_values || [] };

    foreach my $datemeta (@dates) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Captures are only read after a successful match; after a failed
        # match $1..$3 would still hold values from some earlier match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        $year += 1900 if $year < 100;             # just to be safe
        $month = "0$month" if $month =~ /^\d$/;   # single digit
        $day   = "0$day"   if $day   =~ /^\d$/;   # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
341
3421;
Note: See TracBrowser for help on using the repository browser.