source: main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm@ 21764

Last change on this file since 21764 was 20790, checked in by kjdon, 15 years ago

set -processing_tmp_files option to secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)

  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1###########################################################################
2#
3# PostScriptPlugin.pm -- plugin to process PostScript files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PostScriptPlugin;
27
28use ConvertBinaryFile;
29use ReadTextFile; # for read_file in convert_post_process. do we need it?
30use sorttools;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Establish the inheritance chain at compile time.  ConvertBinaryFile is the
# primary parent; ReadTextFile is listed as well (the file's import comment
# says it is wanted for read_file in convert_post_process).
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Allowed values for the -convert_to option.  Each entry pairs an option
# value ('name') with the resource key of its description ('desc').
my $convert_to_list = [
    { name => 'auto',         desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'text',         desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'pagedimg_jpg', desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_gif', desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { name => 'pagedimg_png', desc => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];
51
# Command-line arguments understood by this plugin.  The default process and
# block expressions are obtained from the subs defined further down in this
# file; the three extract_* entries are simple on/off flags.
my $arguments = [
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BasePlugin.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {
        name => 'extract_date',
        desc => '{PostScriptPlugin.extract_date}',
        type => 'flag',
    },
    {
        name => 'extract_pages',
        desc => '{PostScriptPlugin.extract_pages}',
        type => 'flag',
    },
    {
        name => 'extract_title',
        desc => '{PostScriptPlugin.extract_title}',
        type => 'flag',
    },
];
77
# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    name           => 'PostScriptPlugin',
    desc           => '{PostScriptPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
84
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - array ref of plugin arguments; a -title_sub option is
#                      appended before the parent constructor sees it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the blessed plugin object.  When the parent constructor reports
# 'info_only', the object is returned straight away without configuring any
# secondary plugins.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    # This title_sub removes "Page 1" added by ps2ascii, and a leading "1",
    # which is often the page number at the top of the page.  Bad luck if
    # your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$pluginlist, $class);
    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @$arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # don't worry about any options etc
    return bless($self, $class) if $self->{'info_only'};

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure the text and HTML secondary plugins each have an option list.
    foreach my $plugin_name ('TextPlugin', 'HTMLPlugin') {
        next if defined $secondary_plugin_options->{$plugin_name};
        $secondary_plugin_options->{$plugin_name} = [];
    }

    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};

    # PagedImagePlugin only gets default options when a paged-image
    # conversion was requested and no options were supplied for it already.
    my $wants_paged_images = defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i;
    if ($wants_paged_images
        && !defined $secondary_plugin_options->{'PagedImagePlugin'}) {
        $secondary_plugin_options->{'PagedImagePlugin'} = [
            "-title_sub", $title_sub,
            "-file_rename_method", "none",
            "-processing_tmp_files",
        ];
    }

    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);
    push(@$text_options, "-file_rename_method", "none");

    push(@$html_options, "-file_rename_method", "none");
    # tell the secondary plugins that they are processing tmp files
    push(@$html_options, "-processing_tmp_files");

    bless $self, $class;

    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
154
155
# Default block expression: a case-insensitive match on a ".eps" filename
# extension.  Takes no meaningful arguments; also called as a plain function
# while the argument table is being built.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
161
# Default process expression: a case-insensitive match on a ".ps" filename
# extension.  Takes no meaningful arguments; also called as a plain function
# while the argument table is being built.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
167
# Re-encode a converted file in place as UTF-8.
#
# This has been commented out in other plugins - do we need it here?  ps
# files are converted to images (item file should be in utf8) or text (uses
# pstoascii), so we shouldn't need to ensure utf8.
#
# Parameters:
#   $conv_filename - path of the converted file; it is read, passed through
#                    unicode::ensure_utf8 and written back to the same path.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Guess the language and encoding of the converted file.
    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # Read the file in using the detected encoding (per the original note,
    # the text should then be in utf8).
    my $content = "";
    $self->read_file($conv_filename, $encoding, $language, \$content);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$content);

    # Write it out again!
    $self->utf8_write_file(\$content, $conv_filename);
}
189
# Scan the raw PostScript file for header-style comments and record any
# Title, Date and Pages values found as metadata on the top section of $doc.
#
# Parameters:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; get_top_section() and
#               add_utf8_metadata($section, $name, $value) are called on it
#
# Each kind of metadata is only searched for when the corresponding plugin
# flag (extract_title, extract_date, extract_pages) is set on $self.
# Scanning stops as soon as nothing requested remains to be found.
#
# If the file cannot be opened, a message is printed to STDERR and no
# metadata is added; the plugin carries on without dying.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as part of the mode, and the handle cannot clash with any
    # other package-level INPUT handle.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    my $date;

    while (1) {
        # Stop before reading further once every requested item is found
        # (or if nothing was requested at all).
        my $still_looking =
               ($self->{'extract_title'} && !$title_found)
            || ($self->{'extract_date'}  && !$date_found)
            || ($self->{'extract_pages'} && !$pages_found);
        last unless $still_looking;

        my $line = <$input>;
        last unless defined $line;

        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # enclosing parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }
        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Form: " Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Form: "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Form: "CreationDate: YYYY Mon DD hh:mm"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # The first "Creation..." entry ends the date search whether
                # or not a date could be parsed from it.
                $date_found = 1;
            }
        }
        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore an empty or zero page count and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);
    return;
}
258
# Plugin-specific processing of a document object.
#
# Builds the full path of the source file from $base_dir and $file, extracts
# PostScript metadata from it into $doc_obj, then hands the unchanged
# argument list on to the parent class's process() and returns its result.
sub process {
    my $self = shift (@_);
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    # @_ still holds every argument except $self.
    return $self->SUPER::process(@_);
}
270
271
2721;
273
Note: See TracBrowser for help on using the repository browser.