source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 6812

Last change on this file since 6812 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the category regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1###########################################################################
2#
3# PDFPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
package PDFPlug;

use ConvertToPlug;

# PDFPlug is a ConvertToPlug: set up the inheritance chain at compile time.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
33
# Descriptions of the options understood by this plugin.  Each entry gives
# the option name, a description key in braces (presumably looked up in a
# resource bundle -- confirm against the usage-printing code), the argument
# type and, where applicable, a default value.
my $arguments = [
    {   name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {   name => 'noimages',
        desc => '{PDFPlug.noimages}',
        type => 'flag',
    },
    {   name => 'complex',
        desc => '{PDFPlug.complex}',
        type => 'flag',
    },
    {   name => 'nohidden',
        desc => '{PDFPlug.nohidden}',
        type => 'flag',
    },
    {   name => 'zoom',
        desc => '{PDFPlug.zoom}',
        deft => '2',
        type => 'int',
    },
    {   name => 'use_sections',
        desc => '{PDFPlug.use_sections}',
        type => 'flag',
    },
];
60
# Top-level description of the plugin itself; 'args' points at the option
# list above.  This structure is appended to the object's 'option_list'
# in new().
my $options = {
    name     => 'PDFPlug',
    desc     => '{PDFPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
66
# Constructor.
#
# Parses the PDFPlug-specific options, builds the underlying ConvertToPlug
# object and records the options that ConvertToPlug passes through to
# gsConvert.pl.
#
# Arguments: the class name, followed by the plugin option list.  Options
#            not listed in the parse specification below are tolerated
#            ("allow_extra_options").
# Returns:   the new plugin object, blessed into $class.
# Dies:      with "\n", after printing a message and the usage text to
#            STDERR, if the options cannot be parsed.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # Copy of the complete option list; this copy (not @_) is what the
    # parent constructor receives on the success path.
    my @args = @_;

    # The title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page.  Bad luck
    # if your document title actually starts with "1 " - is there a better
    # way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        # A throw-away object, created only so the usage text can be printed.
        my $self = ConvertToPlug->new($class, @_, "-title_sub", $title_sub);
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    # Explicit method-call syntax: the indirect-object form
    # "new ConvertToPlug (...)" is ambiguous inside a package that defines
    # its own new().
    my $self = ConvertToPlug->new($class, @args, "-title_sub", $title_sub);
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push (@{$option_list}, $options);

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    my @convert_options = ("-pdf_zoom $zoom");
    push (@convert_options, "-pdf_complex")       if $complex;
    push (@convert_options, "-pdf_nohidden")      if $nohidden;
    push (@convert_options, "-pdf_ignore_images") if $noimages;
    $self->{'convert_options'} = join (" ", @convert_options);

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
121
122
# sub print_usage {
#     print STDERR "\n usage: plugin PDFPlug [options]\n\n";
#     print STDERR " options:\n";
#     print STDERR " -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
#     print STDERR " -use_sections Create a separate section for each page\n";
#     print STDERR " of the PDF file.\n";
#     print STDERR " -noimages Don't attempt to extract images from PDF.\n";
#     print STDERR " -complex Create more complex output. With this option\n";
#     print STDERR " set the output html will look much more like\n";
#     print STDERR " the original PDF file. For this to function\n";
#     print STDERR " properly you need Ghostscript installed (for *nix\n";
#     print STDERR " gs should be on your path while for windows\n";
#     print STDERR " you must have gswin32c.exe on your path).\n";
#     print STDERR " -nohidden Prevent pdftohtml from attempting to extract\n";
#     print STDERR " hidden text. This is only useful if the -complex\n";
#     print STDERR " option is also set.\n";
#     print STDERR " -zoom The factor by which to zoom the PDF for output\n";
#     print STDERR " (this is only useful if -complex is set).\n\n";
# }
142
143
144
# Default regular expression selecting the files this plugin processes:
# any name ending in ".pdf", matched case-insensitively.
# Any arguments passed in are ignored.
sub get_default_process_exp {
    my $pdf_files = q^(?i)\.pdf$^;
    return $pdf_files;
}
150
# Default block expression: the empty string, so that HTMLPlug's block
# expression is not inherited.
sub get_default_block_exp {
    my $no_block_exp = "";
    return $no_block_exp;
}
155
156
# Do plugin specific processing of doc_obj for HTML type, then hand over to
# ConvertToPlug::process_type.
#
# When the 'use_sections' option is set and the document was converted to
# HTML, the text is split at the per-page anchors ("<a name=1></a>" etc)
# and rewritten in place: an (immediately closed) top section titled with
# the start of the first page, followed by one section per page titled
# with the number taken from that page's anchor.
#
# Arguments (after $self): $_[0] is a reference to the document text and
# $_[3] is reported in the verbose message (presumably the file being
# processed -- confirm against the caller).  The whole argument list is
# passed on unchanged to ConvertToPlug::process_type.
# Returns: whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print STDERR "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        my @sections = split (/<a name=/, $$textref);
        shift @sections;  # don't need HTML header, etc

        # Only rewrite the text when at least one page anchor was found;
        # otherwise the whole document would be replaced by an empty top
        # section.
        if (@sections) {
            # handle first section specially for title? Or all use first 100...
            my $title = $sections[0];
            $title =~ s/^\d+>//;              # specific for pdftohtml...
            $title =~ s/<\/([^>]+)><\1>//g;   # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g;
            $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;  # utf-8 for nbsp...
            $title =~ s/^\s+//s;
            $title =~ s/\s+$//;
            $title =~ s/\s+/ /gs;
            $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $title =~ s/^\s+//s;              # in case title_sub introduced any...
            $title = substr ($title, 0, 100);
            # NOTE(review): the last word is replaced by "..." even when the
            # text was shorter than 100 characters -- confirm this is intended.
            $title =~ s/\s\S*$/.../;

            my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

            # add metadata per section...
            foreach my $section (@sections) {
                # Greenstone does magic if sections are titled digits.
                # Take the number only from a successful match: after a
                # failed substitution $1 would still hold a capture left
                # over from an earlier regex.
                my $page_number = "";
                if ($section =~ s@^(\d+)></a>@@) {  # leftover from split expression...
                    $page_number = $1;
                }
                else {
                    print STDERR "no title: $section\n";
                }

                # $section aliases the array element, so this updates
                # @sections in place.
                $section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
                    . "<Metadata name=\"Title\">" . $page_number
                    . "</Metadata>\n--><p>\n"
                    . $section
                    . "<!--</Section>-->\n";
            }

            $$textref = join ('', ($top_section, @sections));
        }
    }

    my $outhandle = $self->{'outhandle'};
    print {$outhandle} "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self, "pdf", @_);
}
212
2131;
Note: See TracBrowser for help on using the repository browser.