Context Navigation

source: gsdl/trunk/perllib/plugins/PostScriptPlugin.pm@ 20790

Last change on this file since 20790 was 20790, checked in by kjdon, 15 years ago
set -processing_tmp_files option to secondary HTML and PagedImage plugins so that the associated files in tmp are not stored as source associated files (used by incremental build to work out what needs reimporting)
Property svn:keywords set to `Author Date Id Revision`
File size: 8.7 KB

Line
1	###########################################################################
2	#
3	# PostScriptPlugin.pm -- plugin to process PostScript files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PostScriptPlugin;
27
28	use ConvertBinaryFile;
29	use ReadTextFile; # for read_file in convert_post_process. do we need it?
30	use sorttools;
31
32	use strict;
33	no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Set up inheritance at compile time: conversion behaviour comes from
# ConvertBinaryFile, text-file reading helpers from ReadTextFile.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Values accepted by the -convert_to argument. Each entry pairs the value's
# name with the dictionary key "{ConvertBinaryFile.convert_to.<name>}" that
# holds its description.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Command line arguments understood by this plugin. The defaults for
# process_exp and block_exp are taken from the get_default_*_exp subs
# defined further down in this file.
my $arguments = [
    {
        'name' => 'convert_to',
        'desc' => '{ConvertBinaryFile.convert_to}',
        'type' => 'enum',
        'reqd' => 'yes',
        'list' => $convert_to_list,
        'deft' => 'text',
    },
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    # The three metadata extraction switches are plain flags.
    {
        'name' => 'extract_date',
        'desc' => '{PostScriptPlugin.extract_date}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_pages',
        'desc' => '{PostScriptPlugin.extract_pages}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_title',
        'desc' => '{PostScriptPlugin.extract_title}',
        'type' => 'flag',
    },
];
77
# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'           => 'PostScriptPlugin',
    'desc'           => '{PostScriptPlugin.desc}',
    'abstract'       => 'no',
    'inherits'       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => 'yes',
    'args'           => $arguments,
};
84
# Constructor. Registers this plugin's arguments and options, builds the
# object through ConvertBinaryFile, then prepares the option lists for the
# secondary plugins (TextPlugin, HTMLPlugin and, for paged-image output,
# PagedImagePlugin) and loads them.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; "-title_sub" and its
#                      pattern are appended before the parent constructor runs
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with this plugin's arguments and options
# Returns the plugin object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The following title_sub removes "Page 1" added by ps2ascii, and a
    # leading "1", which is often the page number at the top of the page.
    # Bad luck if your document title actually starts with "1 " - is there a
    # better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    #push(@$inputargs,"-convert_to");
    #push(@$inputargs,"text");
    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure the Text and HTML option lists exist so they can be appended to.
    foreach my $plugin_name ('TextPlugin', 'HTMLPlugin') {
        if (!defined $secondary_plugin_options->{$plugin_name}) {
            $secondary_plugin_options->{$plugin_name} = [];
        }
    }

    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};

    # The alternation must use an unescaped "|"; written as "\|" the pattern
    # only matches a literal pipe character, so values such as "pagedimg_jpg"
    # would never set up the PagedImagePlugin options.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ m/pagedimage|pagedimg/i) {
        # Only supply defaults when no PagedImagePlugin options exist yet.
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            my $pagedimg_options = [];
            $secondary_plugin_options->{'PagedImagePlugin'} = $pagedimg_options;
            push(@$pagedimg_options, "-title_sub", $title_sub);
            push(@$pagedimg_options, "-file_rename_method", "none");
            push(@$pagedimg_options, "-processing_tmp_files");
        }
    }

    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    push(@$text_options, "-file_rename_method", "none");
    push(@$html_options, "-file_rename_method", "none");

    # tell the secondary plugins that they are processing tmp files
    push(@$html_options, "-processing_tmp_files");

    bless $self, $class;

    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
154
155
# Default -block_exp: a case-insensitive pattern matching file names that
# end in ".eps". Takes no meaningful arguments; it is also called as a plain
# function while the argument table is being built.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
161
# Default -process_exp: a case-insensitive pattern matching file names that
# end in ".ps". Takes no meaningful arguments; it is also called as a plain
# function while the argument table is being built.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
167
# Rewrite the converted file $conv_filename in place as UTF-8.
#
# Open question carried over from the original author: this step has been
# commented out in other plugins, and may not be needed here either. ps
# files are converted to images (item file should be in utf8) or text (uses
# pstoascii), so ensuring utf8 should not be necessary.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Ask textcat for the language and encoding of the converted file.
    my ($lang, $enc) = $self->textcat_get_language_encoding($conv_filename);

    # Read the file in ($content will be in utf8).
    my $content = "";
    $self->read_file($conv_filename, $enc, $lang, \$content);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$content);

    # Write it out again over the same file.
    return $self->utf8_write_file(\$content, $conv_filename);
}
189
# Scan the PostScript file line by line and add the metadata selected by the
# plugin options to the top section of $doc:
#   extract_title -> "Title", from "Title: ..." text, with "(Untitled)",
#                    "Microsoft Word", the enclosing parentheses and a
#                    leading " - " stripped
#   extract_date  -> "Date", from "Creation..." text in one of three known
#                    layouts, formatted by sorttools::format_date
#   extract_pages -> "Pages", from "Pages: N" text where N is not 0
#
# Arguments:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; get_top_section() and add_utf8_metadata()
#               are called on it
# Returns nothing. A file that cannot be opened is reported on STDERR and
# skipped instead of aborting the import.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $want_title = $self->{'extract_title'};
    my $want_date  = $self->{'extract_date'};
    my $want_pages = $self->{'extract_pages'};

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Nothing was asked for, so there is no reason to read the file.
    return unless ($want_title || $want_date || $want_pages);

    # Three-argument open with a lexical handle: characters in the file name
    # can never be taken as an open mode, and a failure is reported rather
    # than silently producing no metadata.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    # Takes (day, month, year). The captures are copied into lexicals on
    # entry, before any other code can run a regex and change $1..$3.
    my $add_date = sub {
        my ($day, $month, $year) = @_;
        my $date = sorttools::format_date($day, $month, $year);
        if (defined $date) {
            $doc->add_utf8_metadata($section, "Date", $date);
        }
    };

    while (my $line = <$input>) {
        if ($want_title && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($want_date && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Layout: " Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $add_date->($2, $1, $3);
                }
                # Layout: "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $add_date->($3, $2, $1);
                }
                # Layout: "CreationDate: YYYY Mon DD hh:mm:ss"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $add_date->($3, $2, $1);
                }
                # The first "Creation..." text ends the date search, whether
                # or not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($want_pages && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Stop reading once everything that was requested has been found.
        last if (!$want_title || $title_found)
             && (!$want_date  || $date_found)
             && (!$want_pages || $pages_found);
    }

    close($input);
    return;
}
258
# Plugin-specific processing of the document object: pull metadata out of
# the original PostScript file first, then hand the unchanged argument list
# to the parent class's process().
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $metadata, $doc_obj,
# $gli. Only $base_dir, $file and $doc_obj are used here.
# Returns whatever the parent process() returns.
sub process {
    my $self = shift;
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
270
271
272	1;
273

Note: See TracBrowser for help on using the repository browser.

Download in other formats: