Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm@ 22689

Last change on this file since 22689 was 22597, checked in by kjdon, 14 years ago
code tidy up. rearranged how convertbinaryfile plugins set up their secondary plugins - now only set up the options for the one they are using. all subclass specific code moved out of convertbinaryfile.new into the appropriate plugin file.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	###########################################################################
2	#
3	# PostScriptPlugin.pm -- plugin to process PostScript files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PostScriptPlugin;
27
28	use ConvertBinaryFile;
29	use ReadTextFile; # for read_file in convert_post_process. do we need it?
30	use sorttools;
31
32	use strict;
33	no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Establish the inheritance chain at compile time: this plugin is both a
# ConvertBinaryFile and a ReadTextFile.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Permitted values for the "convert_to" option.  Each entry pairs the value
# name with the key of its description string, which has the form
# "{ConvertBinaryFile.convert_to.<name>}".
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Command-line style arguments accepted by this plugin.  The list is appended
# to the shared "ArgList" in new().
my $arguments = [
    # Target format of the conversion; values come from $convert_to_list.
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },
    # Which files this plugin processes / blocks (regular expressions).
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    # Flags switching on the individual metadata extractors in
    # extract_metadata_from_postscript().
    {
        'name' => "extract_date",
        'desc' => "{PostScriptPlugin.extract_date}",
        'type' => "flag",
    },
    {
        'name' => "extract_pages",
        'desc' => "{PostScriptPlugin.extract_pages}",
        'type' => "flag",
    },
    {
        'name' => "extract_title",
        'desc' => "{PostScriptPlugin.extract_title}",
        'type' => "flag",
    },
];
77
# Plugin description record; appended to the shared "OptList" in new().
my $options = {
    'name'           => "PostScriptPlugin",
    'desc'           => "{PostScriptPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
84
# Constructor: builds a PostScriptPlugin on top of ConvertBinaryFile and
# sets up the options for the secondary plugin that handles the converted
# output.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      and its pattern are appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's $arguments and $options
#
# Returns the object blessed into $class.  When the base constructor reports
# 'info_only', the object is returned immediately without any further setup.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs, "-title_sub");
    push(@$inputargs, '^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit method-call syntax: the indirect-object form
    # "new ConvertBinaryFile(...)" relies on parser heuristics, which is
    # fragile inside a package that itself defines a sub called "new".
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    # "auto" means plain text for PostScript.
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "text";
    }

    # set convert_to_plugin and convert_to_ext
    $self->ConvertBinaryFile::set_standard_convert_settings();
    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Only the secondary plugin actually in use gets an option list.
    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
142
143
# Default value of the "block_exp" option: a case-insensitive pattern
# matching file names that end in ".eps".  Any arguments are ignored.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
149
# Default value of the "process_exp" option: a case-insensitive pattern
# matching file names that end in ".ps".  Any arguments are ignored.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
155
# Re-encodes the converted file in place as UTF-8.
#
# This step has been commented out in other plugins - do we need it here?
# ps files are converted to images (item file should be in utf8) or text
# (uses pstoascii), so we shouldn't need to ensure utf8.
#
#   $conv_filename - path of the converted file; it is read, passed through
#                    unicode::ensure_utf8 and written back to the same path.
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Language and encoding as reported for the converted file.
    my ($language, $encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file(\$text, $conv_filename);
}
177
# Scans a PostScript file line by line and adds Title, Date and Pages
# metadata to the top section of $doc.  Each kind of metadata is only looked
# for when the matching plugin flag ('extract_title', 'extract_date',
# 'extract_pages') is set on $self.
#
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Returns nothing.  If the file cannot be opened a message is printed to
# STDERR and no metadata is added; the failure is not fatal.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $want_title = $self->{'extract_title'};
    my $want_date  = $self->{'extract_date'};
    my $want_pages = $self->{'extract_pages'};

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Nothing was asked for, so there is no point reading the file.
    return unless $want_title || $want_date || $want_pages;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and the handle cannot clash with any
    # other bareword INPUT handle.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    # Formats a day/month/year triple with sorttools and records it as Date
    # metadata when sorttools returns a defined value.  The arguments are
    # copied first so the regex capture variables are not passed on as
    # live aliases.
    my $add_date = sub {
        my ($day, $month, $year) = @_;
        my $date = sorttools::format_date($day, $month, $year);
        if (defined $date) {
            $doc->add_utf8_metadata($section, "Date", $date);
        }
    };

    while (my $line = <$input>) {
        if ($want_title && !$title_found) {
            # m{...} delimiters keep the '/' in the character class from
            # needing an escape.
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                my $new_word = $word;
                # Strip placeholder titles and surrounding parentheses.
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($want_date && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # "<Mon> <dd> <time> <yyyy>"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $add_date->($2, $1, $3);
                }
                # "D:<yyyy><mm><dd><6 digits>)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $add_date->($3, $2, $1);
                }
                # "CreationDate: <yyyy> <Mon> <dd> <time>"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $add_date->($3, $2, $1);
                }
                # The search for a date stops at the first "Creation..."
                # token, whether or not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($want_pages && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore an empty or zero page count and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Stop reading once every requested item has been dealt with.
        last if (!$want_title || $title_found)
             && (!$want_date  || $date_found)
             && (!$want_pages || $pages_found);
    }

    close($input);
    return;
}
246
# Plugin-specific processing of the document object: pulls metadata out of
# the PostScript source file, then hands the unchanged argument list on to
# the parent class's process() and returns its result.
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $metadata, $doc_obj,
# $gli.  Only $base_dir, $file and $doc_obj are used here.
sub process {
    my $self = shift (@_);
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    # Full path of the source file = base directory + relative file name.
    my $source_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($source_path, $doc_obj);

    return $self->SUPER::process(@_);
}
258
259
260	1;
261

Note: See TracBrowser for help on using the repository browser.

Download in other formats: