source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 3833

Last change on this file since 3833 was 3833, checked in by jrm21, 21 years ago

fixed up parsing the use_sections argument.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1###########################################################################
2#
3# PDFPlug.pm -- reasonably with-it pdf plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PDFPlug;
27
28use ConvertToPlug;
29
# Set up inheritance at compile time: this package's parent class is
# ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
33
# Self-description of the options this plugin understands.  Each entry is a
# hash with a 'name', a human-readable 'desc' and a 'type', plus optional
# 'deft' (default value) and 'reqd' (whether the option is required).
#
# The table is kept in step with what new() actually parses and with what
# print_usage() reports: -noimages, -complex, -zoom and -use_sections.
my $arguments = [ { 'name' => "process_exp",
		    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
		    'type' => "string",
		    'deft' => q^(?i)\.pdf$^,
		    'reqd' => "no" },
		  { 'name' => "block_exp",
		    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
		    'type' => 'string',
		    'deft' => q^^ },
		  { 'name' => "use_sections",
		    'desc' => "Create a separate section for each page of the PDF file.",
		    'type' => "flag" },
		  { 'name' => "noimages",
		    'desc' => "Don't attempt to extract images from PDF.",
		    'type' => "flag" },
		  { 'name' => "complex",
		    'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
		    'type' => "flag" },
		  { 'name' => "zoom",
		    'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).",
		    'type' => "int",
		    # matches the default in the parse spec used by new()
		    'deft' => "2" }
		  ];

# Top-level description of the plugin itself; new() appends this to the
# object's 'option_list'.
my $options = { 'name'     => "PDFPlug",
		'desc'     => "Reasonably with-it pdf plugin.",
		'inherits' => "yes",
		'args'     => $arguments };
58
# Constructor.
#
# Takes the class name followed by the plugin's option list.  The
# PDFPlug-specific options (-noimages, -complex, -zoom, -use_sections) are
# handed to parsargv::parse; whatever is left in the argument list
# afterwards is passed on to the ConvertToPlug constructor.
#
# Returns the new object, blessed into $class.
# Dies (after printing an error and the usage text to STDERR) when
# parsargv::parse reports failure.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections);

    # NOTE(review): the "zoom/\d+/2" spec presumably means "digits only,
    # default 2" - confirm against parsargv.
    if (!parsargv::parse(\@_,
			 q^noimages^, \$noimages,
			 q^complex^, \$complex,
			 q^zoom/\d+/2^, \$zoom,
			 q^use_sections/1?/^, \$use_sections,
			 "allow_extra_options")) {

	print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
	print_usage();
	die "\n";
    }

    # -use_sections is implemented by handing an extra -description_tags
    # option to the parent constructor.
    my @args = @_;
    if ($use_sections) {
	push (@args, "-description_tags");
    }

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?

    # Explicit method-call syntax; same call as the old indirect-object form.
    my $self = ConvertToPlug->new ($class, @args, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    if ($use_sections) {
	$self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    # (guard against an unset input_encoding before comparing it)
    if (defined $self->{'input_encoding'}
	&& $self->{'input_encoding'} eq "auto") {
	$self->{'input_encoding'} = "utf8";
	$self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
109
110
# Print a summary of the options PDFPlug accepts to STDERR.
# Takes no arguments; called by new() when option parsing fails.
sub print_usage {
    print STDERR "\n usage: plugin PDFPlug [options]\n\n";
    print STDERR " options:\n";
    print STDERR " -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
    print STDERR " -use_sections Create a separate section for each page\n";
    print STDERR " of the PDF file.\n";
    print STDERR " -noimages Don't attempt to extract images from PDF.\n";
    print STDERR " -complex Create more complex output. With this option\n";
    print STDERR " set the output html will look much more like\n";
    print STDERR " the original PDF file. For this to function\n";
    print STDERR " properly you need Ghostscript installed (for\n";
    print STDERR " *nix gs should be on your path while for\n";
    print STDERR " windows you must have gswin32c.exe on your\n";
    print STDERR " path).\n";
    print STDERR " -zoom The factor by which to zoom the PDF for\n";
    print STDERR " output (this is only useful if -complex is\n";
    print STDERR " set).\n\n";
}
127
128
129
# Default process_exp for this plugin: a case-insensitive match on
# filenames ending in ".pdf".  The invocant is accepted but not used.
sub get_default_process_exp {
    my ($self) = @_;

    my $pdf_suffix_exp = '(?i)\.pdf$';
    return $pdf_suffix_exp;
}
135
# Default block_exp for this plugin: the empty string, overriding the
# default so that HTMLPlug's block expression is not inherited.
sub get_default_block_exp {
    my $no_block_exp = '';
    return $no_block_exp;
}
140
141
# Do plugin-specific processing of the converted document, then hand it on
# to ConvertToPlug::process_type.
#
# Arguments after $self are forwarded unchanged to
# ConvertToPlug::process_type.  This sub itself only uses $_[0], a
# reference to the document text, and $_[3], which is named in the verbose
# log message.
#
# When the 'use_sections' option is set and the file was converted to HTML,
# the text is rewritten in place: it is cut at every '<a name=' anchor,
# each chunk is wrapped in Section markup titled with its page number, and
# an empty top-level section titled from the first page is put in front.
# Text before the first anchor (the HTML header) is dropped.  Text without
# any such anchor is left untouched.
#
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);
    if ($self->{'use_sections'}
	&& $self->{'converted_to'} eq "HTML") {

	print STDERR "PDFPlug: Calculating sections...\n";
	my $textref = $_[0];

	# we have "<a name=1></a>" etc for each page
	my @sections = split('<a name=', $$textref);

	shift @sections; # don't need HTML header, etc

	# Only rewrite the text when at least one anchor was found;
	# otherwise the whole document would be replaced by an empty
	# top-level section.
	if (@sections) {
	    # handle first section specially for title? Or all use first 100...
	    my $title = $sections[0];
	    $title =~ s/^\d+>//; # specific for pdftohtml...
	    $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	    $title =~ s/<[^>]*>/ /g;
	    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
	    $title =~ s/^\s+//s;
	    $title =~ s/\s+$//;
	    $title =~ s/\s+/ /gs;
	    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	    $title =~ s/^\s+//s; # in case title_sub introduced any...

	    # Only shorten, and only mark with "...", when the text really
	    # is longer than the 100 characters we keep; a short title is
	    # used as it stands.
	    if (length($title) > 100) {
		$title = substr ($title, 0, 100);
		$title =~ s/\s\S*$/.../; # drop the (possibly cut) last word
	    }

	    my $top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";

	    # add metadata per section...
	    foreach my $section (@sections) {
		# Greenstone does magic if sections are titled digits
		my $page_title;

		# Take the capture only when the substitution matched, so a
		# chunk without a leading page number can never pick up a
		# stale $1 from an earlier match.
		if ($section =~ s@^(\d+)></a>@@) { # leftover from split expression...
		    $page_title = $1;
		}
		else {
		    print STDERR "no title: $section\n";
		    $page_title = "";
		}

		# $section aliases the array element, so this updates
		# @sections in place.
		$section = "<!-- from PDFPlug -->\n<!-- <Section>\n"
		    . "<Metadata name=\"Title\">" . $page_title
		    . "</Metadata>\n--><p>\n"
		    . $section
		    . "<!--</Section>-->\n";
	    }

	    $$textref = join('', ($top_section, @sections));
	}
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
	if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type($self,"pdf",@_);
}
197
1981;
Note: See TracBrowser for help on using the repository browser.