source: main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1###########################################################################
2#
3# PostScriptPlugin.pm -- plugin to process PostScript files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PostScriptPlugin;
27
28use ConvertBinaryFile;
29use ReadTextFile; # for read_file in convert_post_process. do we need it?
30use sorttools;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Set up inheritance at compile time.  ReadTextFile is listed as a second
# parent so that its read_file method is available to convert_post_process.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Values accepted by the -convert_to option.  Every entry pairs the option
# value ('name') with a brace-wrapped description key ('desc') built from
# that same value.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Command-line style arguments understood by this plugin.  The process_exp
# and block_exp defaults come from this package's get_default_* subs; the
# three extract_* flags switch on the corresponding metadata extraction in
# extract_metadata_from_postscript.
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BaseImporter.block_exp}",
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "extract_date",
        'desc' => "{PostScriptPlugin.extract_date}",
        'type' => "flag",
    },
    {
        'name' => "extract_pages",
        'desc' => "{PostScriptPlugin.extract_pages}",
        'type' => "flag",
    },
    {
        'name' => "extract_title",
        'desc' => "{PostScriptPlugin.extract_title}",
        'type' => "flag",
    },
];
77
# Plugin description record; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => "PostScriptPlugin",
    'desc'     => "{PostScriptPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'     => $arguments,
};
84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      is appended before it is handed to the parent class
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the new object blessed into $class.  When the parent constructor
# sets 'info_only', the object is returned immediately without configuring
# conversion settings or loading secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # following title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect object form
    # "new ConvertBinaryFile(...)" is ambiguous to the parser inside a
    # package that defines its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'file_type'} = "PS";

    # "auto" is resolved to plain text conversion for PostScript.
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "text";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();
    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # The secondary plugin gets the same title clean-up as this plugin.
    push(@$specific_options, "-title_sub", $title_sub);
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    } elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
141
142
# Default block_exp: a case-insensitive match on filenames ending in ".eps".
# Takes no meaningful arguments (any invocant is ignored).
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
148
# Default process_exp: a case-insensitive match on filenames ending in ".ps".
# Takes no meaningful arguments (any invocant is ignored).
sub get_default_process_exp {
    return '(?i)\.ps$';
}
154
# Re-encode a converted file as UTF-8, in place.
#
# this has been commented out in other plugins. do we need it here?
# ps files are converted to images (item file should be in utf8) or text
# (uses pstoascii), so we shouldn't need to ensure utf8
#
# $conv_filename - path of the converted file; it is read, cleaned and then
#                  written back to the same path.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Work out which language/encoding to read the converted file with.
    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file (contents will be in utf8)
    my $contents = "";
    $self->read_file($conv_filename, $encoding, $language, \$contents);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$contents);

    # Write it out again!
    $self->utf8_write_file(\$contents, $conv_filename);
}
176
# Scan a PostScript file line by line and attach Title, Date and/or Pages
# metadata to the top section of $doc.
#
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Which items are looked for is controlled by the 'extract_title',
# 'extract_date' and 'extract_pages' flags on $self.  If none is set the
# file is not opened at all.  If the file cannot be opened, a message is
# printed to STDERR and the sub returns without adding metadata, so that
# the caller can carry on processing the document.
sub extract_metadata_from_postscript {
    my $self = shift (@_);

    my ($filename, $doc) = @_;

    my $want_title = $self->{'extract_title'};
    my $want_date  = $self->{'extract_date'};
    my $want_pages = $self->{'extract_pages'};

    # Nothing requested: don't bother reading the file.
    return unless $want_title || $want_date || $want_pages;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as part of the mode, and the handle cannot clash with a
    # global INPUT handle used elsewhere.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    my $date;

    while (my $line = <$input>) {
        if ($want_title && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # parentheses that delimit the PostScript string.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }
        if ($want_date && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Form: "... Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Form: "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Form: "CreationDate: YYYY Mon DD hh:mm:ss"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Only the first "Creation..." line is ever examined,
                # whether or not a date could be parsed from it.
                $date_found = 1;
            }
        }
        if ($want_pages && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # A "Pages:" line with no number or with 0 is ignored, so a
                # later line can still supply the count.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }

        # Once everything that was asked for has been found, the rest of
        # the file cannot contribute anything.
        last if (!$want_title || $title_found)
             && (!$want_date  || $date_found)
             && (!$want_pages || $pages_found);
    }

    close($input);
    return;
}
245
246# do plugin specific processing of doc_obj
247sub process {
248 my $self = shift (@_);
249 my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
250
251 my $filename = &util::filename_cat($base_dir,$file);
252 $self->extract_metadata_from_postscript($filename, $doc_obj);
253
254 return $self->SUPER::process(@_);
255
256}
257
258
2591;
260
Note: See TracBrowser for help on using the repository browser.