source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 10537

Last change on this file since 10537 was 10536, checked in by chi, 19 years ago

Added pagedimg conversion types for PS documents, so PS documents can now be converted to the TEXT, PAGEDIMG_JPG, PAGEDIMG_GIF, or PAGEDIMG_PNG types.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlug;
29
30use ConvertToPlug;
31use sorttools;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Set up inheritance at compile time: PSPlug is a ConvertToPlug.
BEGIN {
    @PSPlug::ISA = qw(ConvertToPlug);
}
39
# Values accepted by the -convert_to option.  Every entry pairs the option
# value with the key of its description string, which always has the form
# "{ConvertToPlug.convert_to.<name>}".
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertToPlug.convert_to.' . $_ . '}' } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
52
# Argument descriptions for this plugin.  Each record gives the option name,
# the key of its description string, its type and, where applicable, a
# default value ('deft') and whether it is required ('reqd').
my $arguments = [
    # The default of an enum must be one of the values offered in its list.
    # It used to be "html", which is not an entry of $convert_to_list, so
    # the default is now "text".
    # NOTE(review): assumes text is the conversion the converter actually
    # supports for PostScript input -- confirm against the conversion script.
    { 'name' => "convert_to",
      'desc' => "{ConvertToPlug.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $convert_to_list,
      'deft' => "text" },

    # Which files this plugin handles; default comes from
    # get_default_process_exp().
    { 'name' => "process_exp",
      'desc' => "{BasPlug.process_exp}",
      'type' => "regexp",
      'deft' => get_default_process_exp(),
      'reqd' => "no" },

    # Which files this plugin blocks; default comes from
    # get_default_block_exp().
    { 'name' => "block_exp",
      'desc' => "{BasPlug.block_exp}",
      'type' => 'regexp',
      'deft' => get_default_block_exp() },

    # Flags selecting which metadata extract_metadata_from_postscript()
    # looks for in the PostScript source.
    { 'name' => "extract_date",
      'desc' => "{PSPlug.extract_date}",
      'type' => "flag" },
    { 'name' => "extract_pages",
      'desc' => "{PSPlug.extract_pages}",
      'type' => "flag" },
    { 'name' => "extract_title",
      'desc' => "{PSPlug.extract_title}",
      'type' => "flag" },
];
78
# Description record for the plugin itself; 'args' points at the argument
# descriptions held in $arguments.
my $options = {
    'name'     => "PSPlug",
    'desc'     => "{PSPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
84
# Constructor.
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a -title_sub option is
#                      appended to it
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument and option descriptions
#
# The object is built by ConvertToPlug, re-blessed into $class, and the
# secondary plugins are loaded with the options prepared here.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # This title_sub removes "Page 1" added by ps2ascii, and a leading "1",
    # which is often the page number at the top of the page.  Bad luck if
    # your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push @$pluginlist, $class;
    push @$inputargs, "-title_sub", $title_sub;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments} if defined $arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options if defined $options;

    my $self;
    if (defined $hashArgOptLists) {
        $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = ConvertToPlug->new($pluginlist, $inputargs);
    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for TEXTPlug to extend.
    $secondary_plugin_options->{'TEXTPlug'} = []
        unless defined $secondary_plugin_options->{'TEXTPlug'};

    # When converting to paged images, give PagedImgPlug the same title_sub,
    # but only if no options have been set up for it already.
    my $wants_paged_images = defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ /pagedimg.*/i;
    if ($wants_paged_images
        && !defined $secondary_plugin_options->{'PagedImgPlug'}) {
        $secondary_plugin_options->{'PagedImgPlug'} = [ "-title_sub", $title_sub ];
    }

    # Options for TEXTPlug: utf8 input, language extraction, and the
    # title clean-up.
    push @{ $secondary_plugin_options->{'TEXTPlug'} },
        "-input_encoding", "utf8",
        "-extract_language",
        "-title_sub", $title_sub;

    $self = bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
130
131
# Default block expression: file names ending in ".eps", matched
# case-insensitively.  Works both as a plain function and as a method;
# the invocant, if any, is ignored.
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)\.(eps)$';
}
137
# Default process expression: file names ending in ".ps", matched
# case-insensitively.  Works both as a plain function and as a method;
# the invocant, if any, is ignored.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.ps$';
}
143
# Rewrite the converted file in place as utf-8: determine its language and
# encoding, read it in, make sure the text is valid utf-8, and write it back
# to the same file name.
sub convert_post_process
{
    my ($self, $converted_file) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($converted_file);

    # read_file fills $contents through the reference (as utf8)
    my $contents = "";
    $self->read_file($converted_file, $encoding, $language, \$contents);

    # turn any high bytes that aren't valid utf-8 into utf-8
    unicode::ensure_utf8(\$contents);

    # and write the result over the converted file
    $self->utf8_write_file(\$contents, $converted_file);
}
163
# Scan the PostScript source in $filename and add Title, Date and Pages
# metadata to the top section of $doc.
#
#   $filename - path of the PostScript file to read
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# What is looked for is controlled by the 'extract_title', 'extract_date'
# and 'extract_pages' entries of $self.  Extraction is best effort: if the
# file cannot be opened a message is printed to STDERR and nothing is added.
# Reading stops as soon as everything that was asked for has been found.
# Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);

    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $want_title = $self->{'extract_title'};
    my $want_date  = $self->{'extract_date'};
    my $want_pages = $self->{'extract_pages'};

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Nothing requested: no point in reading the file at all.
    return unless $want_title || $want_date || $want_pages;

    # Three-argument open with a lexical handle, so the file name is never
    # interpreted as a mode and the handle cannot clash with other code.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($want_title && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # enclosing parentheses from the candidate title.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }
        if ($want_date && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # The captures are copied into lexicals before calling
                # format_date, because $1.. passed directly are aliases
                # that a regex inside the callee could change.
                my $date;

                # three-letter month, day, time, four-digit year
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    my ($month, $day, $year) = ($1, $2, $3);
                    $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # "D:" followed by YYYYMMDD and six more digits
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # four-digit year, three-letter month, day, time
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Only the first line carrying a "Creation..." entry is
                # considered, whether or not a date could be parsed from it.
                $date_found = 1;
            }
        }
        if ($want_pages && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # An empty or zero count is ignored so that a later
                # "Pages:" entry can still be picked up.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }

        # Stop reading once everything that was requested has been found.
        last if (!$want_title || $title_found)
             && (!$want_date  || $date_found)
             && (!$want_pages || $pages_found);
    }

    close($input);

    return;
}
232
# Plugin-specific processing of a PostScript document: pull metadata out of
# the PostScript source into $doc_obj, then hand the file on to
# process_type() with the type "ps" and return its result.
# $textref, $pluginfo, $metadata and $gli are accepted but not used here.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->process_type("ps", $base_dir, $file, $doc_obj);
}
249
250
2511;
252
Note: See TracBrowser for help on using the repository browser.