source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 10276

Last change on this file since 10276 was 10274, checked in by chi, 19 years ago

A modification to allow a secondary plug setting through ConvertToPlug.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlug;
29
30use ConvertToPlug;
31use sorttools;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Set up inheritance at compile time: PSPlug derives from ConvertToPlug.
BEGIN {
    @PSPlug::ISA = qw(ConvertToPlug);
}
39
# Plugin-specific arguments. The "desc" values are brace-wrapped lookup
# keys, not literal descriptions. The defaults for process_exp and
# block_exp come from this package's get_default_*_exp subs.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "extract_date",
        'desc' => "{PSPlug.extract_date}",
        'type' => "flag",
    },
    {
        'name' => "extract_pages",
        'desc' => "{PSPlug.extract_pages}",
        'type' => "flag",
    },
    {
        'name' => "extract_title",
        'desc' => "{PSPlug.extract_title}",
        'type' => "flag",
    },
];
59
# Plugin description record; new() appends it to the caller's "OptList".
# 'args' points at the argument table declared above.
my $options = {
    'name'     => "PSPlug",
    'desc'     => "{PSPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
65
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it.
#   $inputargs       - array ref of plugin arguments; "-convert_to text"
#                      and a "-title_sub" pattern are appended to it.
#   $hashArgOptLists - hash ref; this plugin's argument table is appended
#                      to its "ArgList" and its option record to "OptList".
#
# Returns the object built by ConvertToPlug->new, re-blessed into $class,
# after load_secondary_plugins() has been called on it.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better
    # way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-convert_to", "text");
    push(@$inputargs, "-title_sub", $title_sub);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use explicit Class->new(...) calls rather than the indirect-object
    # form "new Class(...)": this package defines its own new(), which makes
    # the indirect form fragile to parse.
    my $self;
    if (defined $hashArgOptLists) {
        $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = ConvertToPlug->new($pluginlist, $inputargs);
    }

    # Make sure there is an option list for the secondary TEXTPlug.
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    # Currently disabled: forcing utf8 input encoding and language
    # extraction, both on this plugin and on the secondary TEXTPlug.
    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    #push(@$text_options, "-input_encoding", "utf8");
    #push(@$text_options, "-extract_language");
    push(@$text_options, "-title_sub", $title_sub);

    # Re-bless the ConvertToPlug object into the requested class.
    $self = bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options);

    return $self;
}
104
105
# Default block expression: file names ending in ".eps", case-insensitive.
# Works both as a method and as a plain function call; the invocant is
# not used.
sub get_default_block_exp {
    my ($self) = @_;

    my $block_exp = '(?i)\.(eps)$';
    return $block_exp;
}
111
# Default process expression: file names ending in ".ps", case-insensitive.
# Works both as a method and as a plain function call; the invocant is
# not used.
sub get_default_process_exp {
    my ($self) = @_;

    my $process_exp = '(?i)\.ps$';
    return $process_exp;
}
117
# Re-encode a converted file in place.
#
# Asks textcat_get_language_encoding() for the file's language and
# encoding, reads the file with read_file() using those values, passes the
# text through unicode::ensure_utf8(), and writes it back to the same file
# name with utf8_write_file().
#
#   $conv_filename - path of the converted file to rewrite.
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # Read the file in ($contents will be in utf8).
    my $contents = "";
    $self->read_file($conv_filename, $encoding, $language, \$contents);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$contents);

    # Write it out again, over the original converted file.
    $self->utf8_write_file(\$contents, $conv_filename);
}
140
# Scan a PostScript file line by line and add Title, Date and Pages
# metadata to the top section of $doc.
#
#   $filename - path of the PostScript file to scan.
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value).
#
# Each kind of metadata is only looked for when the matching option
# ($self->{'extract_title'}, {'extract_date'}, {'extract_pages'}) is set,
# and scanning for that kind stops once it has been found.
#
# If the file cannot be opened, a message is printed to STDERR and the sub
# returns without adding any metadata. Returns nothing useful.
sub extract_metadata_from_postscript {
    my $self = shift (@_);

    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode characters or whitespace trimming), and the handle
    # cannot clash with any other use of a global INPUT handle.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # enclosing PostScript string parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;

                # "... Mon DD hh:mm:ss YYYY"
                if (my ($month, $day, $year) =
                    $word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }

                # "D:YYYYMMDDhhmmss)"
                if (my ($year, $month, $day) =
                    $word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }

                # "CreationDate: YYYY Mon DD hh:mm:ss"
                if (my ($year, $month, $day) =
                    $word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }

                # Any "Creation..." text ends the date search, whether or
                # not one of the formats above was recognised.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Skip entries without a usable page count (for example
                # "Pages: (atend)" or "Pages: 0").
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);

    return;
}
209
# Plugin-specific processing of a PostScript document.
#
# Builds the full path of the source file from $base_dir and $file, lets
# extract_metadata_from_postscript() add metadata from it to $doc_obj, and
# then hands over to process_type() with type "ps". The return value is
# whatever process_type() returns.
#
# $textref, $pluginfo, $metadata and $gli are accepted but not used here.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->process_type("ps", $base_dir, $file, $doc_obj);
}
226
227
2281;
229
Note: See TracBrowser for help on using the repository browser.