source: main/trunk/greenstone2/perllib/plugins/PDFv1Plugin.pm@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion are moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. Yet to introduce the PDFv2Plugin. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.

File size: 15.1 KB
Line 
1###########################################################################
2#
3# PDFv1Plugin.pm -- The older pdf plugin, which uses the older pdftohtml
4# tool that can't handle newer versions of PDFs.
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2018 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26package PDFv1Plugin;
27
28use strict;
29no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
30no strict 'subs'; # allow filehandles to be variables and viceversa
31
32use ConvertBinaryFile;
33use ReadTextFile;
34use unicode;
35
36
37@PDFv1Plugin::ISA = ('ConvertBinaryFile', 'ReadTextFile');
38
39# PDFv1 plugin should be returned to being more like it was before AutoLoadConverters/PDFBox extension's inclusion
40# like the PDFPlugin was at http://trac.greenstone.org/browser/main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm?rev=22597
# Values accepted by the -convert_to argument. Every entry's description
# is the ConvertBinaryFile.convert_to.<name> resource-string key, so the
# list is generated from the names alone (order is significant).
my $convert_to_list = [
    map {
        +{  'name' => $_,
            'desc' => "{ConvertBinaryFile.convert_to.$_}" }
    } qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
55
56
# Arguments specific to this plugin. new() appends them to the shared
# "ArgList". The 'desc' values are resource-string keys, not literal text.
my $arguments = [
    {   'name' => 'convert_to',
        'type' => 'enum',
        'list' => $convert_to_list,
        'deft' => 'html',
        'reqd' => 'yes',
        'desc' => '{ConvertBinaryFile.convert_to}',
    },
    {   'name' => 'process_exp',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
        'desc' => '{BaseImporter.process_exp}',
    },
    {   'name' => 'block_exp',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
        'desc' => '{CommonUtil.block_exp}',
    },
    {   'name' => 'metadata_fields',
        'type' => 'string',
        'deft' => 'Title,Author,Subject,Keywords',
        'desc' => '{HTMLPlugin.metadata_fields}',
    },
    {   'name' => 'metadata_field_separator',
        'type' => 'string',
        'deft' => '',
        'desc' => '{HTMLPlugin.metadata_field_separator}',
    },
    {   'name' => 'noimages',
        'type' => 'flag',
        'desc' => '{PDFPlugin.noimages}',
    },
    {   'name' => 'allowimagesonly',
        'type' => 'flag',
        'desc' => '{PDFPlugin.allowimagesonly}',
    },
    {   'name' => 'complex',
        'type' => 'flag',
        'desc' => '{PDFPlugin.complex}',
    },
    {   'name' => 'nohidden',
        'type' => 'flag',
        'desc' => '{PDFPlugin.nohidden}',
    },
    {   'name'  => 'zoom',
        'type'  => 'int',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'desc'  => '{PDFPlugin.zoom}',
    },
    {   'name' => 'use_sections',
        'type' => 'flag',
        'desc' => '{PDFPlugin.use_sections}',
    },
    {   'name' => 'description_tags',
        'type' => 'flag',
        'desc' => '{HTMLPlugin.description_tags}',
    },
    {   'name' => 'use_realistic_book',
        'type' => 'flag',
        'desc' => '{PDFPlugin.use_realistic_book}',
    },
];
109
# Plugin descriptor that new() appends to the shared "OptList".
my $options = {
    'name'           => 'PDFv1Plugin',
    'desc'           => '{PDFPlugin.desc}',
    'args'           => $arguments,
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
};
116
# Constructor.
#
# Appends this plugin's arguments and options to the shared lists, builds
# the underlying ConvertBinaryFile object, assembles the option string
# stored in $self->{'convert_options'}, settles the effective convert_to
# value and prepares the option list for the secondary plugin that reads
# the converted output.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      pattern is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with $arguments and $options
#
# Returns the new object blessed into $class. When the base object has
# 'info_only' set, it is returned straight away without further setup.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs,"-title_sub");
    push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than the indirect-object form
    # "new ConvertBinaryFile(...)", which perl can mis-parse.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_tool pdftohtml"; # PDFv1Plugin only ever uses the old pdftohtml conversion tool
    $self->{'convert_options'} .= " -pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    # GSDLOS may be unset; treat that as "not Windows" rather than matching against undef.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        print STDERR "*** On Windows, PDFv1Plugin does not support pdf to text. PDFs will be converted to HTML instead.\n";
        print STDERR "*** Use PDFv2Plugin if you really want pdf to text conversion.\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        } else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    # $self was already blessed into $class above and nothing in this
    # function re-blesses it, so the former second bless here was dropped.
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}
218
# Default process_exp: a regular expression matching any filename that
# ends in ".pdf", case-insensitively. Takes no meaningful arguments.
sub get_default_process_exp {
    my $pdf_suffix_re = '(?i)\.pdf$';
    return $pdf_suffix_re;
}
224
225# so we don't inherit HTMLPlug's block exp...
# Default block_exp: the empty string, i.e. this plugin supplies no
# blocking expression of its own.
sub get_default_block_exp {
    my $no_block_exp = '';
    return $no_block_exp;
}
229
230
231# By setting hashing to be on ga xml this ensures that two
232# PDF files that are identical except for the metadata
233# hash to different values. Without this, when each PDF
234# file is converted to HTML there is a chance that they
235# will both be *identical* if the conversion utility does
236# not embed the metadata in the generated HTML. This is
237# certainly the case when PDFBOX is being used.
238
239# This change makes this convert to based plugin more
240# consistent with the original vision that the same document
241# with different metadata should
242# be seen as different.
243
# Name of the OID hashing scheme this plugin asks for. Always
# "hash_on_ga_xml", whatever arguments are passed.
sub get_oid_hash_type {
    my ($self) = @_;
    my $hash_type = 'hash_on_ga_xml';
    return $hash_type;
}
248
249
250#sub tmp_area_convert_file {
251#
252# my $self = shift (@_);
253# return $self->AutoLoadConverters::tmp_area_convert_file(@_);
254#
255#}
256
# Post-process, in place, the file produced by the conversion step.
#
# The file is read as utf8 via read_file(), surrogate code points are
# replaced by spaces, and the number of <a name=...> page anchors is
# stored in $self->{'num_pages'}. When 'use_sections' is set and
# 'converted_to' is "HTML", the text is split at each page anchor and
# every page is wrapped in <Section> description-tag comments. The result
# is written back to the same file with utf8_write_file().
#
# Parameters:
#   $conv_filename - path of the converted file to rewrite
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Clean html from low and high surrogates D800-DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as it might
    # be useful to give an indication of document length in browser through setting
    # num_pages as metadata.
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig); #<div style=\"?page-break-before:always; page-break-after:always\"?>
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            # keep HTML header etc as top_section; split() yields an empty
            # list for empty text, so guard against appending undef
            my $header = shift @sections;
            $top_section .= $header if defined $header;
        }

        # handle first section specially for title? Or all use first 100...

        my $title = defined $sections[0] ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        # NOTE(review): this replaces the last word with "..." even when the
        # title was shorter than 100 characters - confirm that is intended.
        $title =~ s/\s\S*$/.../;


        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits
            my $section_title;

            # Only trust $1 when the substitution actually matched: after a
            # failed match $1 still holds the capture of an earlier
            # successful match, which would silently give this section a
            # stale title.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><br />\n"; #TODO: . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # loop variable aliases the array element
        }

        $text=join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }


    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
362
363
364# do plugin specific processing of doc_obj for HTML type
# Do plugin specific processing of doc_obj for HTML type.
#
# Delegates to process_type(), then on the document's top section:
#   - replaces each lower-case "date" metadata value with a "Date" value
#     in yyyymmdd form, dropping values that cannot be parsed or whose
#     year is 0;
#   - adds "NumPages" when $self->{'num_pages'} is defined;
#   - sets "gsdlthistype" when use_sections is on and the conversion
#     produced HTML.
#
# Parameters: $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# (only $base_dir, $file and $doc_obj are used here).
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Take the captures from the match's return list: on a failed match
        # this yields an empty list (all three undef), whereas reading
        # $1..$3 afterwards would pick up the captures of an earlier
        # successful match and re-add a previous value's date.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless (defined($year) && defined($month) && defined($day));

        next if ($year == 0);
        if ($year < 100) {$year += 1900} # just to be safe
        if ($month =~ /^\d$/) {$month="0$month"} # single digit
        if ($day =~ /^\d$/) {$day="0$day"} # single digit
        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}
408
4091;
Note: See TracBrowser for help on using the repository browser.