source: gsdl/trunk/perllib/plugins/PSPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1###########################################################################
2#
3# PSPlugin.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlugin;
29
30use ConvertBinaryFile;
31use ReadTextFile; # for read_file in convert_post_process. do we need it?
32use sorttools;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Set up inheritance at compile time: PSPlugin derives from
# ConvertBinaryFile first and ReadTextFile second.
BEGIN {
    @PSPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
40
# Values accepted by the convert_to argument.  Every entry pairs the value's
# name with a description key of the form {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
53
# Argument descriptions contributed by PSPlugin (appended to "ArgList" in
# new()).  The 'desc' values look like resource keys rather than literal
# text -- presumably resolved elsewhere; confirm against the base plugins.
my $arguments = [
    {   name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {   name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasePlugin.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {   name => 'extract_date',
        desc => '{PSPlugin.extract_date}',
        type => 'flag',
    },
    {   name => 'extract_pages',
        desc => '{PSPlugin.extract_pages}',
        type => 'flag',
    },
    {   name => 'extract_title',
        desc => '{PSPlugin.extract_title}',
        type => 'flag',
    },
];
79
# Plugin description record (appended to "OptList" in new()); 'args' points
# at the argument descriptions declared above.
my $options = {
    name           => 'PSPlugin',
    desc           => '{PSPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
86
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      pattern is appended before the base constructor runs
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the object blessed into $class.  When the base constructor sets
# 'info_only', the object is returned straight away without configuring or
# loading any secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The following title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    #push(@$inputargs,"-convert_to");
    #push(@$inputargs,"text");
    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form
    # "new ConvertBinaryFile(...)" relies on parser heuristics and can be
    # mis-parsed, especially inside a package that defines its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TextPlugin'};

    # Paged-image conversions get the same title clean-up, but only when no
    # PagedImagePlugin options have been supplied already.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [ "-title_sub", $title_sub ];
        }
    }

    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
140
141
# Returns the default value of the block_exp argument: a case-insensitive
# pattern matching names that end in ".eps".  Any arguments are ignored.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
147
# Returns the default value of the process_exp argument: a case-insensitive
# pattern matching names that end in ".ps".  Any arguments are ignored.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
153
# this has been commented out in other plugins. do we need it here?
#
# Re-encodes a converted file in place.
#
#   $converted_file - path of the file produced by the conversion step
#
# The language and encoding are detected via textcat_get_language_encoding(),
# the file is read with read_file(), any bytes that are not valid utf-8 are
# fixed up by unicode::ensure_utf8(), and the text is written back to the
# same path.  Returns whatever utf8_write_file() returns.
sub convert_post_process
{
    my ($self, $converted_file) = @_;

    my ($lang, $enc) = $self->textcat_get_language_encoding($converted_file);

    # read_file() fills the buffer through the reference (text will be utf8)
    my $content = "";
    $self->read_file($converted_file, $enc, $lang, \$content);

    unicode::ensure_utf8(\$content);

    $self->utf8_write_file(\$content, $converted_file);
}
174
# Scans a PostScript file line by line for "Title: ", "Creation..." and
# "Pages: " entries and records what it finds as Title, Date and Pages
# metadata on the top section of the document.
#
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Each kind of metadata is only looked for when the matching plugin flag
# (extract_title, extract_date, extract_pages) is set.  Extraction is
# best-effort: if the file cannot be opened a message is printed to STDERR
# and the sub returns without adding anything.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Nothing was requested, so there is no point reading the file at all.
    return unless $self->{'extract_title'}
        || $self->{'extract_date'}
        || $self->{'extract_pages'};

    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode characters or whitespace stripping) and the handle
    # cannot collide with a global INPUT handle used elsewhere.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlugin: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # parentheses that surround the title string.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Three date layouts are recognised; each one that matches
                # and formats successfully adds a Date entry.
                # Month-name day time year
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    my $date = &sorttools::format_date($2,$1,$3);
                    $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
                }
                # D:YYYYMMDD followed by six more digits and a closing paren
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    my $date = &sorttools::format_date($3,$2,$1);
                    $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
                }
                # CreationDate: year month-name day time
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    my $date = &sorttools::format_date($3,$2,$1);
                    $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
                }
                # The first Creation entry ends the date search even when
                # none of the layouts above produced a date.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore empty or zero page counts and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Stop reading as soon as everything that was asked for is found;
        # the remaining lines could not contribute anything.
        last if (!$self->{'extract_title'} || $title_found)
            && (!$self->{'extract_date'} || $date_found)
            && (!$self->{'extract_pages'} || $pages_found);
    }

    close($input);
    return;
}
243
# do plugin specific processing of doc_obj
#
# Positional arguments after $self:
#   pluginfo, base_dir, file, metadata, doc_obj, gli
#
# Metadata is first extracted from the original PostScript file (base_dir
# joined with file) into doc_obj; the same argument list is then handed to
# the inherited process(), whose result is returned.
sub process {
    my $self = shift;
    my ($base_dir, $file, $doc_obj) = @_[1, 2, 4];

    my $source_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($source_path, $doc_obj);

    return $self->SUPER::process(@_);
}
255
256
2571;
258
Note: See TracBrowser for help on using the repository browser.