Context Navigation

source: gsdl/trunk/perllib/plugins/PostScriptPlugin.pm@ 18406

Last change on this file since 18406 was 18406, checked in by ak19, 15 years ago
Modified srcreplaceable plugins (plugins which operate on docs where the source file can be replaced with their converted htmls) to set the file_rename_method to none for secondary plugins (for Text, HTML, and PagedImage plugins) so that the file is not renamed several times.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# PostScriptPlugin.pm -- plugin to process PostScript files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PostScriptPlugin;
27
28	use ConvertBinaryFile;
29	use ReadTextFile; # for read_file in convert_post_process. do we need it?
30	use sorttools;
31
32	use strict;
33	no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Establish the inheritance chain at compile time: this plugin derives from
# both ConvertBinaryFile and ReadTextFile (in that method-resolution order).
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Values accepted by the "convert_to" argument. Every entry pairs the value
# name with a description key of the form {ConvertBinaryFile.convert_to.<name>}.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Argument descriptions contributed by this plugin. The "desc" values are
# lookup keys rather than literal text; the regexp defaults are taken from
# the get_default_*_exp subs defined further down in this file.
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    # The three extract_* flags switch on the matching parts of
    # extract_metadata_from_postscript.
    {
        'name' => "extract_date",
        'desc' => "{PostScriptPlugin.extract_date}",
        'type' => "flag",
    },
    {
        'name' => "extract_pages",
        'desc' => "{PostScriptPlugin.extract_pages}",
        'type' => "flag",
    },
    {
        'name' => "extract_title",
        'desc' => "{PostScriptPlugin.extract_title}",
        'type' => "flag",
    },
];
77
# Plugin-level option record that new() appends to the caller's "OptList".
my $options = {
    'name'     => "PostScriptPlugin",
    'desc'     => "{PostScriptPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'     => $arguments,
};
84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it.
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      and its pattern are appended to it.
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions.
#
# Returns the object created by ConvertBinaryFile's constructor, blessed
# into $class. When the object's 'info_only' entry is true it is returned
# immediately, before any secondary-plugin set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Removes "Page 1" added by ps2ascii, and a leading "1", which is often
    # the page number at the top of the page. Bad luck if your document
    # title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure option lists exist for the Text and HTML secondary plugins.
    foreach my $plugin_name ('TextPlugin', 'HTMLPlugin') {
        if (!defined $secondary_plugin_options->{$plugin_name}) {
            $secondary_plugin_options->{$plugin_name} = [];
        }
    }

    my $text_options = $secondary_plugin_options->{'TextPlugin'};
    my $html_options = $secondary_plugin_options->{'HTMLPlugin'};

    # The alternation needs an unescaped "|"; an escaped one only matches a
    # literal pipe character and so never matches values like "pagedimg_jpg".
    if (defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ m/(?:pagedimage|pagedimg)/i) {
        # Only supply defaults when no PagedImagePlugin options were given.
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
            push(@$pagedimg_options, "-file_rename_method", "none");
        }
    }

    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    # Secondary plugins must not rename the file again.
    push(@$text_options, "-file_rename_method", "none");
    push(@$html_options, "-file_rename_method", "none");

    $self = bless $self, $class;

    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
150
151
# Default "block_exp" regexp source: matches file names ending in ".eps",
# case-insensitively. Takes no meaningful arguments.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
157
# Default "process_exp" regexp source: matches file names ending in ".ps",
# case-insensitively. Takes no meaningful arguments.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
163
# Rewrites the converted file in place as utf8.
#
# This has been commented out in other plugins - do we need it here?
# PS files are converted to images (item file should be in utf8) or to text
# (uses pstoascii), so we shouldn't need to ensure utf8.
#
#   $conv_filename - path of the converted file; it is read and overwritten.
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    # Guess the language and encoding of the converted file.
    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # Read the file into memory ($text will be in utf8).
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write the result back over the same file.
    return $self->utf8_write_file(\$text, $conv_filename);
}
185
# Scans a PostScript file line by line and records Title, Date and Pages
# metadata on the document's top section.
#
#   $filename - path of the PostScript file to scan.
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value).
#
# Each kind of metadata is only looked for when the matching flag
# ('extract_title', 'extract_date', 'extract_pages') is set on $self.
# If the file cannot be opened a message is printed to STDERR and no
# metadata is added. Returns nothing useful.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open on a lexical handle: the file name can never be
    # interpreted as an open mode, and failure is reported instead of
    # silently yielding an empty read loop.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                # Strip placeholder titles and the surrounding parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;

                # "<Mon> <dd> <time> <yyyy>" style
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = sorttools::format_date($2, $1, $3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # "D:yyyymmddhhmmss)" style
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # "CreationDate: <yyyy> <Mon> <dd> <time>" style
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }

                # Stop looking after the first "Creation..." line, whether
                # or not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore empty and zero page counts; keep scanning for a later one.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);
    return;
}
254
# Plugin-specific processing of the document object: pull metadata out of
# the PostScript source first, then let the parent class do the rest.
#
# Arguments after $self, in order:
#   $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# Returns whatever the parent class's process() returns.
sub process {
    my $self = shift (@_);

    # Only three of the arguments are needed here; @_ itself is left
    # untouched so it can be forwarded as-is to the parent class.
    my ($base_dir, $file, $doc_obj) = @_[1, 2, 4];

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
266
267
268	1;
269

Note: See TracBrowser for help on using the repository browser.

Download in other formats: