source: gsdl/trunk/perllib/plugins/PostScriptPlugin.pm@ 18406

Last change on this file since 18406 was 18406, checked in by ak19, 15 years ago

Modified the srcreplaceable plugins (plugins that operate on documents whose source file can be replaced with its converted HTML) to set file_rename_method to "none" for their secondary plugins (the Text, HTML, and PagedImage plugins), so that the file is not renamed several times.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# PostScriptPlugin.pm -- plugin to process PostScript files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PostScriptPlugin;
27
28use ConvertBinaryFile;
29use ReadTextFile; # for read_file in convert_post_process. do we need it?
30use sorttools;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Establish the inheritance chain at compile time. ConvertBinaryFile is the
# primary parent; ReadTextFile is listed as well (the file's own note says it
# is wanted for read_file in convert_post_process).
sub BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Values accepted by the -convert_to argument. Each entry pairs the value's
# name with a braced description key (presumably resolved against a string
# resource bundle elsewhere -- confirm).
my $convert_to_list = [
    { 'name' => 'auto',         'desc' => '{ConvertBinaryFile.convert_to.auto}' },
    { 'name' => 'text',         'desc' => '{ConvertBinaryFile.convert_to.text}' },
    { 'name' => 'pagedimg_jpg', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { 'name' => 'pagedimg_gif', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { 'name' => 'pagedimg_png', 'desc' => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];
51
# Arguments this plugin contributes to the argument list built up in new().
# The process_exp and block_exp defaults are taken from the subs defined
# further down in this file; convert_to offers the values in $convert_to_list.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'text',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    # Flags switching on the individual metadata extractions performed by
    # extract_metadata_from_postscript.
    {
        'name' => 'extract_date',
        'desc' => '{PostScriptPlugin.extract_date}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_pages',
        'desc' => '{PostScriptPlugin.extract_pages}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_title',
        'desc' => '{PostScriptPlugin.extract_title}',
        'type' => 'flag',
    },
];
77
# Plugin description record appended to the option list in new().
my $options = {
    'name'           => 'PostScriptPlugin',
    'desc'           => '{PostScriptPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      is appended before the parent constructor sees it
#   $hashArgOptLists - hash ref whose 'ArgList' and 'OptList' arrays receive
#                      this plugin's $arguments and $options
#
# Returns the object built by ConvertBinaryFile, reblessed into $class. When
# the parent reports 'info_only', the object is returned straight away with
# no further setup. Otherwise the options for the secondary plugins
# (TextPlugin, HTMLPlugin and, for paged-image conversion, PagedImagePlugin)
# are prepared and the secondary plugins are loaded.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    #push(@$inputargs,"-convert_to");
    #push(@$inputargs,"text");
    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call rather than the indirect-object form
    # "new ConvertBinaryFile(...)", whose parse depends on what the compiler
    # has already seen.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure option lists exist for the text and HTML secondary plugins.
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }

    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};

    # Paged-image conversion: PagedImagePlugin gets default options only when
    # none were supplied for it already.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
            push(@$pagedimg_options, "-file_rename_method", "none");
        }
    }

    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    # The secondary plugins must not rename the file again.
    push(@$text_options, "-file_rename_method", "none");
    push(@$html_options, "-file_rename_method", "none");

    $self = bless $self, $class;
    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
150
151
# Default value for the block_exp argument: a case-insensitive pattern
# matching filenames that end in ".eps". Any argument passed is ignored.
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)\.(eps)$';
}
157
# Default value for the process_exp argument: a case-insensitive pattern
# matching filenames that end in ".ps". Any argument passed is ignored.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.ps$';
}
163
# Rewrite a converted file in place as UTF-8.
#
# Arguments:
#   $conv_filename - path of the converted file; it is read and then
#                    overwritten
#
# Returns whatever utf8_write_file returns.
#
# NOTE(review): this has been commented out in other plugins. Do we need it
# here? ps files are converted to images (item file should be in utf8) or
# text (uses pstoascii), so we shouldn't need to ensure utf8.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    # Ask the inherited helper for the file's language and encoding.
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    return $self->utf8_write_file (\$text, $conv_filename);
}
185
# Scan a PostScript file line by line for "Title: ", "Creation..." and
# "Pages: " entries (presumably the %%Title / %%CreationDate / %%Pages header
# comments) and record what is found as Title, Date and Pages metadata on the
# top section of $doc.
#
# Arguments:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; get_top_section() and
#               add_utf8_metadata($section, $name, $value) are called on it
#
# Each field is only looked for when the matching plugin flag
# (extract_title, extract_date, extract_pages) is set, and the search for a
# field stops at the first line that yields a value for it. Scanning ends as
# soon as nothing requested remains to be found.
#
# If the file cannot be opened a message is printed to STDERR and the sub
# returns without adding metadata; it never dies. Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open on a lexical handle: the filename is never
    # interpreted as a mode or pipe, and failure is reported, not ignored.
    my $input_fh;
    if (!open($input_fh, '<', $filename)) {
        print STDERR "PostScriptPlugin: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    while (my $line = <$input_fh>) {
        my $want_title = $self->{'extract_title'} && !$title_found;
        my $want_date  = $self->{'extract_date'}  && !$date_found;
        my $want_pages = $self->{'extract_pages'} && !$pages_found;

        # Nothing (left) to look for: no point reading the rest of the file.
        last unless $want_title || $want_date || $want_pages;

        if ($want_title) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip generator boilerplate and the surrounding parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }

        if ($want_date) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # The search is only marked as finished once a date has really
                # been added, so an unparseable "Creation..." entry no longer
                # hides a usable one later in the file (the title and pages
                # searches already work this way).
                my $date;

                # e.g. "... Jan 15 10:30:00 2009"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                        $date_found = 1;
                    }
                }
                # e.g. "(D:20090115103000)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                        $date_found = 1;
                    }
                }
                # e.g. "CreationDate: 2009 Jan 15 10:30"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                        $date_found = 1;
                    }
                }
            }
        }

        if ($want_pages) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # An entry with no number, or a count of 0, is skipped so that
                # a later entry can supply the real value.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }
    }

    close($input_fh);

    return;
}
254
# do plugin specific processing of doc_obj
#
# Pulls metadata out of the PostScript source (base_dir joined with file)
# into doc_obj, then hands the untouched argument list
# ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) on to the parent
# class's process and returns its result.
sub process {
    my $self = shift;
    # Only three of the arguments are needed here; @_ itself is left as it is
    # so that it can be passed on unchanged below.
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
266
267
2681;
269
Note: See TracBrowser for help on using the repository browser.