source: main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm@ 25971

Last change on this file since 25971 was 22874, checked in by kjdon, 14 years ago

No longer use filename_extension, as we should be using the original extension. E.g. when processing an .odt document with the Word plugin, the associated file used to be named doc.doc instead of doc.odt.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
RevLine 
[1685]1###########################################################################
2#
[17725]3# PostScriptPlugin.pm -- plugin to process PostScript files
[1685]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[17725]26package PostScriptPlugin;
[3540]27
[15872]28use ConvertBinaryFile;
29use ReadTextFile; # for read_file in convert_post_process. do we need it?
[1700]30use sorttools;
[1685]31
[10254]32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Set up inheritance at compile time. ConvertBinaryFile supplies the
# constructor used by new(); ReadTextFile is pulled in for read_file, which
# convert_post_process relies on.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Permitted values for the -convert_to option. Each entry pairs the option
# value with the resource key of its description, which always has the form
# {ConvertBinaryFile.convert_to.<value>}.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Command-line style arguments accepted by this plugin. The 'desc' values are
# resource keys, not literal descriptions.
my $arguments = [
    # Output format of the conversion; one of the values in $convert_to_list.
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    # Files this plugin processes (defaults to *.ps).
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    # Files this plugin blocks from other plugins (defaults to *.eps).
    {
        name => 'block_exp',
        desc => '{BasePlugin.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    # Flags that switch on the individual extractors in
    # extract_metadata_from_postscript.
    {
        name => 'extract_date',
        desc => '{PostScriptPlugin.extract_date}',
        type => 'flag',
    },
    {
        name => 'extract_pages',
        desc => '{PostScriptPlugin.extract_pages}',
        type => 'flag',
    },
    {
        name => 'extract_title',
        desc => '{PostScriptPlugin.extract_title}',
        type => 'flag',
    },
];
[3540]77
# Plugin description record that new() registers in the shared "OptList".
my $options = {
    name           => 'PostScriptPlugin',
    desc           => '{PostScriptPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      pattern is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the new object blessed into $class. When the parent constructor
# sets 'info_only', the object is returned straight away without any
# conversion or secondary-plugin set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs, "-title_sub");
    push(@$inputargs, '^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) call rather than the indirect-object form
    # "new Class(...)": this package defines its own new(), which makes the
    # indirect form ambiguous to the parser.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'file_type'} = "PS";

    # "auto" means plain text for PostScript input.
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "text";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();
    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for the secondary plugin to add to.
    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;

    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
141
142
# Default block_exp: a case-insensitive pattern matching file names that end
# in ".eps". The invocant is accepted but not used.
sub get_default_block_exp {
    my $self = shift;

    my $eps_pattern = '(?i)\.(eps)$';
    return $eps_pattern;
}
148
# Default process_exp: a case-insensitive pattern matching file names that
# end in ".ps". The invocant is accepted but not used.
sub get_default_process_exp {
    my $self = shift;

    my $ps_pattern = '(?i)\.ps$';
    return $ps_pattern;
}
154
# Re-encode a converted file as UTF-8, in place.
#
# Arguments:
#   $conv_filename - path of the converted file to read and then overwrite
#
# NOTE(review): this has been commented out in other plugins - do we need it
# here? PostScript files are converted either to images (the item file should
# already be in utf8) or to text (via pstoascii), so ensuring utf8 should not
# be necessary.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    # Guess the language and encoding of the converted file.
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
176
# Scan a PostScript file line by line and add Title, Date and Pages metadata
# to the top section of a document.
#
# Arguments:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Only the fields switched on through $self->{'extract_title'},
# $self->{'extract_date'} and $self->{'extract_pages'} are looked for.
# Scanning stops as soon as every requested field has been found. If the file
# cannot be opened, a message is written to STDERR and no metadata is added.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open on a lexical handle: the file name can never be
    # misread as an open mode, and the handle cannot clash with a global.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    LINE: while (my $line = <$input>) {
        my $need_title = $self->{'extract_title'} && !$title_found;
        my $need_date  = $self->{'extract_date'}  && !$date_found;
        my $need_pages = $self->{'extract_pages'} && !$pages_found;

        # Nothing left to look for: skip the rest of the file.
        last LINE unless $need_title || $need_date || $need_pages;

        if ($need_title) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # PostScript string parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }

        if ($need_date) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # "... Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    my $date = &sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    my $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # "CreationDate: YYYY Mon DD hh:mm:ss"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    my $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # The first "Creation..." line ends the search, whether or
                # not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($need_pages) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore an empty or zero page count.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);
    return;
}
245
# Plugin-specific processing of the document object: pull metadata out of the
# PostScript source first, then hand the untouched argument list on to the
# parent class's process().
sub process {
    my $self = shift (@_);

    # Only the base directory, file name and document object are needed here;
    # the full argument list stays in @_ for the parent call.
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $source_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($source_path, $doc_obj);

    return $self->SUPER::process(@_);
}
257
258
2591;
260
Note: See TracBrowser for help on using the repository browser.