source: gsdl/trunk/perllib/plugins/PDFPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1###########################################################################
2#
3# PDFPlugin.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25package PDFPlugin;
26
27use ConvertBinaryFile;
28use ReadTextFile;
29use unicode;
30use strict;
31no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
32
# Set up inheritance at compile time: PDFPlugin derives from
# ConvertBinaryFile first, then ReadTextFile.
BEGIN {
    @PDFPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
36
# Permitted values for the "convert_to" argument. Each 'desc' is a key of
# the form {Plugin.option}; the same keys appear throughout this file.
my $convert_to_list = [
    {
        'name' => 'auto',
        'desc' => '{ConvertBinaryFile.convert_to.auto}',
    },
    {
        'name' => 'html',
        'desc' => '{ConvertBinaryFile.convert_to.html}',
    },
    {
        'name' => 'text',
        'desc' => '{ConvertBinaryFile.convert_to.text}',
    },
    {
        'name' => 'pagedimg_jpg',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}',
    },
    {
        'name' => 'pagedimg_gif',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}',
    },
    {
        'name' => 'pagedimg_png',
        'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}',
    },
];
51
52
# Argument table for this plugin. new() appends it to the shared "ArgList".
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'html',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlugin.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'allowimagesonly',
        'desc' => '{PDFPlugin.allowimagesonly}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlugin.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlugin.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlugin.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlugin.use_sections}',
        'type' => 'flag',
    },
    {
        'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
];
98
# Plugin description record. new() appends it to the shared "OptList".
my $options = {
    'name'           => 'PDFPlugin',
    'desc'           => '{PDFPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
105
# Constructor.
#
# Arguments (all are references that this constructor appends to):
#   $pluginlist      - array of plugin class names; $class is appended
#   $inputargs       - argument list for this plugin; a default
#                      "-title_sub" pair is appended
#   $hashArgOptLists - hash whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument table and option record
#
# Returns the object created by ConvertBinaryFile->new, re-blessed into
# $class. When the base constructor sets 'info_only', the object is
# returned immediately without any further configuration.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call rather than indirect object syntax ("new Class (...)").
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }
    # PagedImagePlugin options are only created when converting to paged
    # images, and only seeded with title_sub when none were supplied.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = ["-title_sub", $title_sub];
        }
    }
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    # May be undef when no PagedImagePlugin options exist (see above).
    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};

    if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
    }

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options, "-metadata_fields", $self->{'metadata_fields'});
    } else {
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    # use_sections implies description_tags
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        # Skip when there is no PagedImagePlugin option list: pushing onto an
        # undefined value would only autovivify an array that is then discarded.
        push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re)
            if defined $pagedimg_options;
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
195
# Default value for the "process_exp" argument: a case-insensitive match on
# a trailing ".pdf". The optional invocant argument is accepted but ignored.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_suffix_exp = q{(?i)\.pdf$};
    return $pdf_suffix_exp;
}
201
# Default value for the "block_exp" argument: the empty string, so that
# HTMLPlug's block expression is not inherited.
sub get_default_block_exp {
    return q{};
}
206
# Post-process the converted file named by $conv_filename, in place.
#
# Steps:
#   * reads the file via $self->read_file, using the language/encoding
#     reported by $self->textcat_get_language_encoding;
#   * counts the '<a name=...>' anchors and stores the count in
#     $self->{'num_pages'};
#   * when 'use_sections' is set and 'converted_to' is "HTML", wraps each
#     anchor-delimited chunk in <Section> description-tag comments;
#   * runs unicode::ensure_utf8 over the text and writes it back with
#     $self->utf8_write_file.
sub convert_post_process
{
    my $self = shift(@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar(@sections) == 1) { # only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # Derive the document title from the text of the first section.
        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        # Cut at 100 characters and replace the trailing (possibly partial)
        # word with an ellipsis - but only when something was actually cut,
        # so that a short title keeps its last word.
        if (length($title) > 100) {
            $title = substr($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }

        if (scalar(@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections = ();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits.
            #
            # Use the result of the substitution to decide whether a name was
            # found: after a failed match $1 still holds the capture of some
            # earlier successful match, so it must not be read unconditionally.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the array element
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file(\$text, $conv_filename);
}
297
298
# Plugin-specific processing of $doc_obj after conversion.
#
# Delegates to $self->process_type("pdf", ...), then on the top section:
#   * replaces every "date" metadata value with a "Date" value in
#     YYYYMMDD form (values that do not parse, or whose year is 0, are
#     dropped);
#   * adds "NumPages" from $self->{'num_pages'};
#   * sets "gsdlthistype" to "Paged" when 'use_sections' is set and the
#     document was converted to HTML.
#
# Returns the result of process_type.
sub process {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type("pdf", $base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Snapshot the values before deleting any of them, so the loop does not
    # depend on whether get_metadata hands back a private copy of the list.
    my @date_values = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@date_values) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Only read the captures when the match succeeded: after a failed
        # match $1..$3 keep the values of an earlier successful match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        if ($year < 100) { $year += 1900 } # just to be safe
        if ($month =~ /^\d$/) { $month = "0$month" } # single digit
        if ($day =~ /^\d$/) { $day = "0$day" } # single digit
        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
338
3391;
Note: See TracBrowser for help on using the repository browser.