source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 9465

Last change on this file since 9465 was 9465, checked in by kjdon, 19 years ago

ConvertToPlug now accepts the gli arg in read() and passes it in process(), and all the convert plugins now print a processing message for gli in their process() method.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1###########################################################################
2#
3# PDFPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25use strict;
26no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
27
28package PDFPlug;
29
30use ConvertToPlug;
31use unicode;
32
# Establish the inheritance chain at compile time: PDFPlug is a ConvertToPlug.
BEGIN {
    @PDFPlug::ISA = qw(ConvertToPlug);
}
36
# Descriptions of the options this plugin understands.  Each entry carries a
# name, a description key and a type, plus an optional default ('deft'),
# 'range' and 'reqd' marker.  The brace-wrapped 'desc' values look like
# resource-bundle keys -- presumably resolved by the code that prints usage.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {
        name => 'noimages',
        desc => '{PDFPlug.noimages}',
        type => 'flag',
    },
    {
        name => 'complex',
        desc => '{PDFPlug.complex}',
        type => 'flag',
    },
    {
        name => 'nohidden',
        desc => '{PDFPlug.nohidden}',
        type => 'flag',
    },
    {
        name  => 'zoom',
        desc  => '{PDFPlug.zoom}',
        deft  => '2',
        range => '1,3',    # actually the range is 0.5-3
        type  => 'int',
    },
    {
        name => 'use_sections',
        desc => '{PDFPlug.use_sections}',
        type => 'flag',
    },
];

# Plugin-level description record; new() pushes it onto the object's
# 'option_list' so that it sits alongside the entries already there.
my $options = {
    name     => 'PDFPlug',
    desc     => '{PDFPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
70
# Constructor.
#
# Parses the PDFPlug-specific options (noimages, complex, zoom, nohidden,
# use_sections) out of the argument list, builds the underlying ConvertToPlug
# object from the complete argument list, and records the conversion options
# that are handed on to gsConvert.pl.
#
# Arguments: the class name followed by the plugin's option list.
# Returns:   the new object, blessed into $class.
# On an option-parsing failure a usage message is printed and the process
# exits with status 1.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # Keep an untouched copy of the options for the base-class constructor;
    # parsargv::parse is given a reference to @_ itself.
    my @args = @_;

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {

        # Build a base object only to get at its output handle and usage text.
        # Explicit Class->method syntax is used instead of "new Class (...)":
        # this package defines its own new(), and indirect object syntax
        # leaves the parse to Perl's heuristics.
        my $self = ConvertToPlug->new($class, @_);
        my $outhandle = $self->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        exit 1;
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $self = ConvertToPlug->new($class, @args,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
126
# Default regular expression selecting the files this plugin handles:
# any name ending in ".pdf", matched case-insensitively.
sub get_default_process_exp {
    shift @_;    # invocant, if any, is not needed

    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
132
# Default block expression: the empty string, so that HTMLPlug's block
# expression is not inherited.
sub get_default_block_exp {
    return q{};
}
137
138
# Do plugin-specific processing of the converted text, then hand it on via
# ConvertToPlug::process_type and tidy up the resulting metadata.
#
# Arguments after $self are read positionally so that @_ can be passed on
# unchanged; the commented-out list below names them:
#   $_[0]  $textref  - reference to the converted document text (modified
#                      in place when sections are generated)
#   $_[3]  $file     - used only in progress messages here
#   $_[5]  $doc_obj  - document object whose metadata is adjusted
#   $_[6]  $gli      - when true, a <Processing .../> line goes to STDERR
#
# Returns whatever ConvertToPlug::process_type returned.
sub process {
    my $self = shift (@_);
    #my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $textref = $_[0];

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $$textref);

        if (scalar (@sections) == 1) { #only one section - no split!
            print $outhandle "PDFPlug: warning - no sections found\n";
        } else {
            shift @sections; # don't need HTML header, etc
        }

        # handle first section specially for title? Or all use first 100...

        # Derive a document title from the start of the first section:
        # strip the split leftover and all markup, normalise whitespace,
        # apply title_sub, then cut to 100 characters and replace the
        # trailing (possibly partial) word with "...".
        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        my $top_section;
        # NOTE(review): a text with exactly one page anchor also has a single
        # element left here (after the shift above), so it takes this branch
        # too and gets no section markup -- confirm that this is intended.
        if (scalar (@sections) == 1) { # no sections found
            $top_section = $sections[0];
            @sections = ();
        } else {
            $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits
            my $section_title;

            # Only trust $1 when the substitution really matched: after a
            # failed match the capture variables still hold the values of an
            # earlier successful match, so testing defined($1) is not enough.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the @sections element
        }

        $$textref = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8($textref);

    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$_[3]' p='PDFPlug'>\n" if ($_[6]);

    # tell htmlplug to extract these metadata fields from the HEAD META fields.
    # Append them only once: process() can be called repeatedly on the same
    # plugin object, and an unconditional append would add a duplicate entry
    # to the list on every call.
    my $extra_fields = ",date,author<Creator>";
    my $current_fields = defined $self->{'metadata_fields'}
        ? $self->{'metadata_fields'}
        : "";
    if (index($current_fields, $extra_fields) < 0) {
        $self->{'metadata_fields'} = $current_fields . $extra_fields;
    }

    my $result = ConvertToPlug::process_type($self, "pdf", @_);

    #my $doc_obj = pop(@_);
    my $doc_obj = $_[5];
    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();

    # Work on a copy of the list, since entries are deleted from the
    # document inside the loop.
    my @datemetas = @{$doc_obj->get_metadata($cursection, "date")};
    foreach my $datemeta (@datemetas) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Skip values that do not look like a date at all, rather than
        # reading capture variables left over from some earlier match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        if ($year < 100) { $year += 1900 } # just to be safe
        if ($month =~ /^\d$/) { $month = "0$month" } # single digit
        if ($day =~ /^\d$/) { $day = "0$day" } # single digit
        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    # Add NumPages metadata (we have "<a name=1>" etc for each page)
    my @pages = ($$textref =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $doc_obj->add_utf8_metadata($cursection, "NumPages", scalar(@pages));

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }
    return $result;
}
255
2561;
Note: See TracBrowser for help on using the repository browser.