source: gsdl/trunk/perllib/plugins/PSPlugin.pm@ 15865

Last change on this file since 15865 was 15865, checked in by kjdon, 16 years ago

renaming plugins in preparation for my plugin overhaul

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlug;
29
30use ConvertToPlug;
31use sorttools;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Establish inheritance at compile time: PSPlug derives from ConvertToPlug.
BEGIN {
    @PSPlug::ISA = qw(ConvertToPlug);
}
39
# Values accepted by the -convert_to option.  Each entry pairs the option
# value ('name') with the resource key of its description ('desc'), which
# always has the form "{ConvertToPlug.convert_to.<name>}".
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => '{ConvertToPlug.convert_to.' . $_ . '}' } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
52
# Descriptions of the arguments this plugin declares.  Each entry gives the
# argument name, the resource key of its description, its type and, where
# applicable, its default value, whether it is required, and (for enums) the
# list of permitted values.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertToPlug.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'text',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'extract_date',
        'desc' => '{PSPlug.extract_date}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_pages',
        'desc' => '{PSPlug.extract_pages}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_title',
        'desc' => '{PSPlug.extract_title}',
        'type' => 'flag',
    },
];
78
# Plugin-level description record: the plugin's name, the resource key of its
# description, a handful of yes/no properties, and its argument list.
my $options = {
    'name'           => 'PSPlug',
    'desc'           => '{PSPlug.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
85
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      and its pattern are appended to it
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by ConvertToPlug's constructor, blessed into
# $class.  Unless the object is flagged 'info_only', the options for the
# secondary plugins (TEXTPlug, and PagedImgPlug for pagedimg conversions) are
# prepared and the secondary plugins are loaded before returning.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # This title_sub removes "Page 1" added by ps2ascii, and a leading "1",
    # which is often the page number at the top of the page.  Bad luck if
    # your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push @$pluginlist, $class;
    push @$inputargs, "-title_sub", $title_sub;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments} if defined $arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options      if defined $options;

    my $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, don't worry about any options.
    return bless $self, $class if $self->{'info_only'};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for the text plugin to extend below.
    $secondary_plugin_options->{'TEXTPlug'} = []
        unless defined $secondary_plugin_options->{'TEXTPlug'};

    # For paged-image conversions, seed the PagedImgPlug options with the
    # title_sub, but only when no options were supplied for it already.
    if (defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ /pagedimg.*/i
        && !defined $secondary_plugin_options->{'PagedImgPlug'}) {
        $secondary_plugin_options->{'PagedImgPlug'} = [ "-title_sub", $title_sub ];
    }

    my $text_options = $secondary_plugin_options->{'TEXTPlug'};
    push @$text_options, "-input_encoding", "utf8";
    push @$text_options, "-extract_language" if $self->{'extract_language'};
    push @$text_options, "-title_sub", $title_sub;

    bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
136
137
# Returns the default block_exp: a case-insensitive pattern, as a string,
# matching file names that end in ".eps".  Arguments are ignored.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
143
# Returns the default process_exp: a case-insensitive pattern, as a string,
# matching file names that end in ".ps".  Arguments are ignored.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
149
# Post-processes a converted file in place: the file is read back using the
# language/encoding reported by textcat_get_language_encoding(), any bytes
# that are not valid UTF-8 are fixed up by unicode::ensure_utf8(), and the
# result is written over the same file.
#
# Arguments:
#   $conv_filename - path of the converted file to rewrite
#
# Returns whatever utf8_write_file() returns.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # Slurp the file; read_file() fills $text (which will be in utf8).
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    return $self->utf8_write_file(\$text, $conv_filename);
}
169
# Scans a PostScript file line by line for header comments and records the
# requested items as metadata on the top section of $doc.
#
# Arguments:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; get_top_section() and add_utf8_metadata()
#               are called on it
#
# Which items are looked for is controlled by flags on $self:
#   extract_title - "Title: ..." lines, stored as "Title"
#   extract_date  - "Creation..." lines, stored as "Date" once converted by
#                   sorttools::format_date()
#   extract_pages - "Pages: N" lines (N non-zero), stored as "Pages"
#
# The file is opened read-only through a lexical handle that is closed before
# returning.  If it cannot be opened, a message is printed to STDERR and no
# metadata is added (extraction is best-effort; the caller carries on).
# Scanning stops as soon as every requested item has been dealt with.
#
# Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open: the file name can never be taken for an open mode.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

  LINE:
    while (my $line = <$input>) {
        my $want_title = $self->{'extract_title'} && !$title_found;
        my $want_date  = $self->{'extract_date'}  && !$date_found;
        my $want_pages = $self->{'extract_pages'} && !$pages_found;

        # Nothing (more) to look for: don't read the rest of the file.
        last LINE unless $want_title || $want_date || $want_pages;

        if ($want_title) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, the application name some
                # generators prepend, and the enclosing parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($want_date) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;

                # e.g. "... Jan 12 10:30:00 2001"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = sorttools::format_date($2, $1, $3);
                    $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
                }
                # e.g. "D:20010112103000)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = sorttools::format_date($3, $2, $1);
                    $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
                }
                # e.g. "CreationDate: 2001 Jan 12 10:30"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = sorttools::format_date($3, $2, $1);
                    $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
                }

                # The first "Creation..." text ends the date search, whether
                # or not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($want_pages) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Skip empty or zero counts and keep looking on later lines.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);

    return;
}
238
# Plugin-specific processing of a PostScript source document: metadata is
# first pulled from the PostScript file itself and added to $doc_obj, then
# the document is handed to process_type() as type "ps".
#
# Arguments:
#   $textref, $pluginfo, $metadata, $gli - accepted but not used here
#   $base_dir - directory that $file is relative to
#   $file     - name of the PostScript file, relative to $base_dir
#   $doc_obj  - document object being built
#
# Returns whatever process_type() returns.
sub process {
    my $self = shift;
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->process_type("ps", $base_dir, $file, $doc_obj);
}
255
256
2571;
258
Note: See TracBrowser for help on using the repository browser.