Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	###########################################################################
2	#
3	# PostScriptPlugin.pm -- plugin to process PostScript files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PostScriptPlugin;
27
28	use ConvertBinaryFile;
29	use ReadTextFile; # for read_file in convert_post_process. do we need it?
30	use sorttools;
31
32	use strict;
33	no strict 'refs'; # allow filehandles to be variables and viceversa
34
# Establish the inheritance chain at compile time: this plugin derives from
# both ConvertBinaryFile and ReadTextFile.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
38
# Allowed values for the -convert_to option. Each entry pairs the option
# value with the lookup key of its description string, which always has the
# form "{ConvertBinaryFile.convert_to.<value>}".
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
51
# Option descriptors contributed by this plugin. The list is appended to the
# shared "ArgList" in new(). The 'desc' values are lookup keys, not literal
# help text.
my $arguments = [
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        deft => &get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BaseImporter.block_exp}',
        type => 'regexp',
        deft => &get_default_block_exp(),
    },
    {
        name => 'extract_date',
        desc => '{PostScriptPlugin.extract_date}',
        type => 'flag',
    },
    {
        name => 'extract_pages',
        desc => '{PostScriptPlugin.extract_pages}',
        type => 'flag',
    },
    {
        name => 'extract_title',
        desc => '{PostScriptPlugin.extract_title}',
        type => 'flag',
    },
];
77
# Plugin-level descriptor; appended to the shared "OptList" in new().
my $options = {
    name           => 'PostScriptPlugin',
    desc           => '{PostScriptPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
84
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - array ref of command-line style arguments; a
#                      "-title_sub" option is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptors
#
# Returns the new object blessed into $class. When the base constructor
# reports 'info_only', the object is returned immediately without any of the
# conversion / secondary-plugin setup below.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The following title_sub removes "Page 1" added by ps2ascii, and a
    # leading "1", which is often the page number at the top of the page.
    # Bad luck if your document title actually starts with "1 " - is there
    # a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form
    # "new ConvertBinaryFile(...)" is parsed heuristically and is best avoided.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'file_type'} = "PS";

    # "auto" is resolved to plain text conversion for PostScript.
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "text";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();
    my $secondary_plugin_name    = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for the chosen secondary plugin.
    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # The secondary plugin gets the same title clean-up as this plugin.
    push(@$specific_options, "-title_sub", $title_sub);
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;

    # used for convert_post_process
    $self->{'input_encoding'}   = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
141
142
# Default value for the block_exp option: a case-insensitive pattern
# matching filenames that end in ".eps".
sub get_default_block_exp {
    my ($self) = @_;    # accepted for method-call compatibility; not used

    my $eps_pattern = '(?i)\.(eps)$';
    return $eps_pattern;
}
148
# Default value for the process_exp option: a case-insensitive pattern
# matching filenames that end in ".ps".
sub get_default_process_exp {
    my ($self) = @_;    # accepted for method-call compatibility; not used

    my $ps_pattern = '(?i)\.ps$';
    return $ps_pattern;
}
154
# This has been commented out in other plugins. Do we need it here?
# PS files are converted to images (item file should be in utf8) or text
# (uses pstoascii), so we shouldn't need to ensure utf8.
#
# Rewrites the converted file in place: the file is read using the
# language/encoding reported by textcat_get_language_encoding(), passed
# through unicode::ensure_utf8(), and written back with utf8_write_file().
#
# Parameters:
#   $conv_filename - path of the converted file to re-encode
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) =
        $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file(\$text, $conv_filename);
}
176
# Scan a PostScript file line by line for "Title:", "Creation..." and
# "Pages:" entries and record what is found as Title / Date / Pages metadata
# on the top section of $doc.
#
# Parameters:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Which items are looked for is controlled by the 'extract_title',
# 'extract_date' and 'extract_pages' entries of $self. Scanning stops as
# soon as every requested item has been found. If the file cannot be opened
# a message is printed to STDERR and no metadata is added.
#
# Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    my $want_title = $self->{'extract_title'};
    my $want_date  = $self->{'extract_date'};
    my $want_pages = $self->{'extract_pages'};

    # Nothing requested: don't read a potentially large file for no result.
    return unless ($want_title || $want_date || $want_pages);

    # Three-argument open with a lexical handle, so that characters in the
    # filename can never be interpreted as an open mode.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    while (my $line = <$input>) {
        if ($want_title && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                my $new_word = $word;
                # Drop placeholder / application names and the enclosing
                # parentheses that surround the title string.
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//;
                $new_word =~ s/\)$//;
                $new_word =~ s/^ - //;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($want_date && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;

                # e.g. " Jan 12 10:30:00 1999" (month, day, time, year)
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2, $1, $3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # e.g. "D:19990112103000)" (YYYYMMDD followed by six digits)
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # e.g. "CreationDate: 1999 Jan 12 10:30" (year, month, day)
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }

                # The first "Creation..." entry ends the date search, even
                # when none of the formats above could be parsed.
                $date_found = 1;
            }
        }

        if ($want_pages && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore empty or zero page counts and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Stop reading once nothing that was asked for is still outstanding.
        my $still_looking = ($want_title && !$title_found)
                         || ($want_date  && !$date_found)
                         || ($want_pages && !$pages_found);
        last unless $still_looking;
    }

    close($input);
    return;
}
245
# Plugin-specific processing of the document object: extract metadata from
# the PostScript source file, then hand the unchanged argument list on to
# the parent class's process().
#
# Parameters (after $self):
#   $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# Only $base_dir, $file and $doc_obj are used here.
#
# Returns whatever the parent class's process() returns.
sub process {
    my $self = shift (@_);
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
257
258
259	1;
260

Note: See TracBrowser for help on using the repository browser.

Download in other formats: