root/gsdl/trunk/perllib/plugins/PostScriptPlugin.pm @ 20790

Revision 20790, 8.7 KB (checked in by kjdon, 10 years ago)

Set the -processing_tmp_files option on the secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (these are used by the incremental build to work out what needs reimporting).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# PostScriptPlugin.pm -- plugin to process PostScript files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PostScriptPlugin;
27
28use ConvertBinaryFile;
29use ReadTextFile; # for read_file in convert_post_process. do we need it?
30use sorttools;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Establish the inheritance chain at compile time: conversion behaviour comes
# from ConvertBinaryFile, text-reading helpers from ReadTextFile.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Allowed values for the -convert_to option. Each entry pairs the value's
# name with the resource key of its description.
my $convert_to_list = [
    map {
        +{ 'name' => $_,
           'desc' => "{ConvertBinaryFile.convert_to.$_}" }
    } qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Plugin argument specifications, appended to the shared "ArgList" in new().
my $arguments = [
    # Output format of the conversion; one of the names in $convert_to_list.
    {   'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },

    # Files matching this expression are processed by the plugin.
    {   'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },

    # Files matching this expression are blocked.
    {   'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },

    # Boolean switches read by extract_metadata_from_postscript().
    (   map {
            +{ 'name' => "extract_$_",
               'desc' => "{PostScriptPlugin.extract_$_}",
               'type' => "flag" }
        } qw(date pages title)
    ),
];
77
# Plugin description record, appended to the shared "OptList" in new().
my $options = {
    'name'           => "PostScriptPlugin",
    'desc'           => "{PostScriptPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
84
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - array ref of command-line style arguments; a default
#                      "-title_sub" pair is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument specs and description record
#
# Returns the blessed plugin object. When the base constructor sets
# 'info_only', the object is returned straight away, without configuring or
# loading the secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The following title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-form method call: the indirect-object form ("new Class(...)") is
    # resolved by parser heuristics and is fragile inside a package that
    # defines its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }

    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};

    # PagedImagePlugin defaults are only supplied for the paged-image output
    # formats, and only when no options for that plugin exist already.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/(pagedimage|pagedimg).*/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            my $pagedimg_options = [];
            $secondary_plugin_options->{'PagedImagePlugin'} = $pagedimg_options;
            push(@$pagedimg_options, "-title_sub", $title_sub);
            push(@$pagedimg_options, "-file_rename_method", "none");
            push(@$pagedimg_options, "-processing_tmp_files");
        }
    }

    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    push(@$text_options, "-file_rename_method", "none");
    push(@$html_options, "-file_rename_method", "none");

    # tell the secondary plugins that they are processing tmp files
    push(@$html_options, "-processing_tmp_files");

    $self = bless $self, $class;
    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
154
155
# Default value of the block_exp argument: a case-insensitive match on
# file names ending in ".eps".
sub get_default_block_exp {
    my ($self) = @_;    # unused; accepted so the sub works as function or method

    return q{(?i)\.(eps)$};
}
161
# Default value of the process_exp argument: a case-insensitive match on
# file names ending in ".ps".
sub get_default_process_exp {
    my ($self) = @_;    # unused; accepted so the sub works as function or method

    return q{(?i)\.ps$};
}
167
# Rewrites the converted file $conv_filename in place as UTF-8.
#
# This hook has been commented out in other plugins - do we need it here?
# PostScript files are converted to images (the item file should be in utf8)
# or to text (uses pstoascii), so ensuring utf8 should not be necessary.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Detect the language and encoding of the converted file.
    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # Read the file in (per the original author's note, the text is then utf8).
    my $content = "";
    $self->read_file($conv_filename, $encoding, $language, \$content);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$content);

    # Write the result back over the converted file.
    $self->utf8_write_file(\$content, $conv_filename);
}
189
# Scans a PostScript file line by line and attaches Title, Date and Pages
# metadata to the top section of $doc.
#
# Parameters:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Only the fields enabled through the 'extract_title', 'extract_date' and
# 'extract_pages' settings on $self are looked for. Reading stops as soon as
# every enabled field has been handled, and the file is not opened at all
# when none is enabled. Extraction is best effort: an unreadable file is
# reported on STDERR (when verbosity is above 0) and otherwise ignored.
#
# Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    my $want_title = $self->{'extract_title'};
    my $want_date  = $self->{'extract_date'};
    my $want_pages = $self->{'extract_pages'};

    # Nothing requested: no need to touch the file.
    return unless ($want_title || $want_date || $want_pages);

    # Three-argument open on a lexical handle: the file name cannot be
    # misread as an open mode, and the handle does not leak into a global.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: could not open \"$filename\" for metadata extraction: $!\n"
            if $self->{'verbosity'} > 0;
        return;
    }

    # Formats one (day, month, year) triple and stores it when valid. The
    # arguments are copied first, so that regex captures passed in are not
    # disturbed by any matching done while the date is formatted.
    my $add_date = sub {
        my ($day, $month, $year) = @_;
        my $date = sorttools::format_date($day, $month, $year);
        $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
    };

    my ($title_found, $date_found, $pages_found) = (0, 0, 0);

    while (my $line = <$input>) {
        if ($want_title && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # enclosing parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($want_date && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # "... Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9])  ?[0-9: ]+ ([0-9]{4})/) {
                    $add_date->($2, $1, $3);
                }
                # "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $add_date->($3, $2, $1);
                }
                # "CreationDate: YYYY Mon DD hh:mm:ss"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $add_date->($3, $2, $1);
                }
                # The first creation comment ends the date search, whether or
                # not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($want_pages && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Stop reading once every requested field has been dealt with.
        last if (!$want_title || $title_found)
             && (!$want_date  || $date_found)
             && (!$want_pages || $pages_found);
    }

    close($input);
    return;
}
258
# Plugin-specific processing of the document object: pull metadata out of
# the PostScript source first, then hand over to the inherited process().
sub process {
    my $self = shift (@_);

    # Positional arguments are: $pluginfo, $base_dir, $file, $metadata,
    # $doc_obj, $gli. Only three are needed here; @_ itself stays untouched
    # so that it can be forwarded unchanged below.
    my ($base_dir, $file, $doc_obj) = @_[1, 2, 4];

    my $ps_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
270
271
2721;
273
Note: See TracBrowser for help on using the browser.