Context Navigation

source: gsdl/trunk/perllib/plugins/PSPlugin.pm@ 17330

Last change on this file since 17330 was 17330, checked in by kjdon, 16 years ago
added default values for self->input_encoding and self->default_encoding, to be used in textcat_get_language_encoding
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	###########################################################################
2	#
3	# PSPlugin.pm -- this might look VERY similar to the PDF plugin...
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# 12/05/02 Added usage datastructure - John Thompson
27
28	package PSPlugin;
29
30	use ConvertBinaryFile;
31	use ReadTextFile; # for read_file in convert_post_process. do we need it?
32	use sorttools;
33
34	use strict;
35	no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Set up inheritance at compile time: PSPlugin derives from both
# ConvertBinaryFile and ReadTextFile (in that method-resolution order).
BEGIN {
    @PSPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
40
# Permitted values for the -convert_to argument.  Each entry pairs the
# option value with the key of its description string.
my $convert_to_list = [
    { name => 'auto',         desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'text',         desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'pagedimg_jpg', desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_gif', desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { name => 'pagedimg_png', desc => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];
53
# Argument specifications for this plugin.  The three extract_* flags are
# the ones consulted by extract_metadata_from_postscript(); the default
# process/block expressions come from the get_default_*_exp subs below.
my $arguments = [
    {
        name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BasePlugin.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {
        name => 'extract_date',
        desc => '{PSPlugin.extract_date}',
        type => 'flag',
    },
    {
        name => 'extract_pages',
        desc => '{PSPlugin.extract_pages}',
        type => 'flag',
    },
    {
        name => 'extract_title',
        desc => '{PSPlugin.extract_title}',
        type => 'flag',
    },
];
79
# Top-level option record describing this plugin; 'args' points at the
# argument specifications declared above.
my $options = {
    name           => 'PSPlugin',
    desc           => '{PSPlugin.desc}',
    abstract       => 'no',
    inherits       => 'yes',
    # Source docs in postscript format can be replaced with GS-generated html
    srcreplaceable => 'yes',
    args           => $arguments,
};
86
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      is appended before it is handed to the parent class
#   $hashArgOptLists - hash ref; this plugin's argument and option records
#                      are appended to its "ArgList" and "OptList" entries
#
# Returns the blessed plugin object.  When the parent constructor reports
# 'info_only', the object is returned immediately without configuring the
# secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page.  Bad luck
    # if your document title actually starts with "1 " - is there a better
    # way?  Kept in one place because the same expression is handed to this
    # plugin and to each secondary plugin.
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form
    # "new ConvertBinaryFile(...)" is ambiguous to the parser, particularly
    # inside a package that defines its own new().
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TextPlugin'};

    # Only seed PagedImagePlugin options when converting to paged images
    # and no options have been supplied for it already.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
        }
    }

    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;

    # Defaults used by convert_post_process (via
    # textcat_get_language_encoding).
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
143
144
# Default block expression: a case-insensitive match on filenames ending
# in ".eps".  Returned as a pattern string, not a compiled regex.
sub get_default_block_exp {
    my $self = shift;

    return '(?i)\.(eps)$';
}
150
# Default process expression: a case-insensitive match on filenames ending
# in ".ps".  Returned as a pattern string, not a compiled regex.
sub get_default_process_exp {
    my $self = shift;

    return '(?i)\.ps$';
}
156
# Rewrite the converted file in place as UTF-8.
#
# This has been commented out in other plugins - do we need it here?
# PS files are converted to images (item file should be in utf8) or to
# text (uses pstoascii), so we shouldn't need to ensure utf8.
#
# Parameters:
#   $conv_filename - path of the converted file; it is read and then
#                    overwritten
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Work out the language and encoding to read the file with.
    my ($lang, $enc) = $self->textcat_get_language_encoding($conv_filename);

    # Read the file in ($contents will be in utf8).
    my $contents = "";
    $self->read_file($conv_filename, $enc, $lang, \$contents);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$contents);

    # Write it out again!
    $self->utf8_write_file(\$contents, $conv_filename);
}
178
# Scan a PostScript file line by line and attach Title, Date and Pages
# metadata to the top section of the document.
#
# Parameters:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Each kind of metadata is only looked for when the matching flag
# ('extract_title', 'extract_date', 'extract_pages') is set on $self, and
# searching for a kind stops once it has been found.  If the file cannot
# be opened a message is printed to STDERR and nothing is extracted.
# Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PSPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open on a lexical handle: the filename can never be
    # misread as a mode, and a failure is reported instead of silently
    # reading nothing.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlugin: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                my $new_word = $word;
                # Strip placeholder titles and producer names, then the
                # surrounding parentheses and any leading " - ".
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # "<Mon> <dd> <time> <yyyy>" style.
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    my ($month, $day, $year) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # "D:yyyymmddhhmmss)" style.
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # "CreationDate: <yyyy> <Mon> <dd> <time>" style.
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # The first "Creation..." text ends the date search, whether
                # or not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore an empty or zero page count.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Nothing left to look for: stop reading the file.
        last if (!$self->{'extract_title'} || $title_found)
             && (!$self->{'extract_date'}  || $date_found)
             && (!$self->{'extract_pages'} || $pages_found);
    }

    close($input);
    return;
}
247
# Plugin-specific processing of doc_obj: harvest metadata from the
# PostScript source itself, then hand over to the parent class's process()
# with the original argument list.
sub process {
    my $self = shift;
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
259
260
261	1;
262

Note: See TracBrowser for help on using the repository browser.

Download in other formats: