Context Navigation

source: gsdl/trunk/perllib/plugins/PSPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 8.0 KB

Line
1	###########################################################################
2	#
3	# PSPlugin.pm -- this might look VERY similar to the PDF plugin...
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# 12/05/02 Added usage datastructure - John Thompson
27
28	package PSPlugin;
29
30	use ConvertBinaryFile;
31	use ReadTextFile; # for read_file in convert_post_process. do we need it?
32	use sorttools;
33
34	use strict;
35	no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Set up inheritance at compile time.  ConvertBinaryFile is the primary base
# class (its constructor is called from new); ReadTextFile is listed second.
BEGIN {
    @PSPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
40
# Values accepted by the "convert_to" argument.  Each 'desc' is a
# brace-wrapped key in the ConvertBinaryFile namespace (presumably resolved
# to display text elsewhere -- confirm against the strings resource).
my $convert_to_list = [
    { name => 'auto',         desc => '{ConvertBinaryFile.convert_to.auto}' },
    { name => 'text',         desc => '{ConvertBinaryFile.convert_to.text}' },
    { name => 'pagedimg_jpg', desc => '{ConvertBinaryFile.convert_to.pagedimg_jpg}' },
    { name => 'pagedimg_gif', desc => '{ConvertBinaryFile.convert_to.pagedimg_gif}' },
    { name => 'pagedimg_png', desc => '{ConvertBinaryFile.convert_to.pagedimg_png}' },
];
53
# Argument descriptions contributed by this plugin.  The list is appended to
# the caller-supplied "ArgList" in new().
my $arguments = [
    {   name => 'convert_to',
        desc => '{ConvertBinaryFile.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {   name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasePlugin.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },

    # The three flags below switch on the matching scans in
    # extract_metadata_from_postscript.
    {   name => 'extract_date',
        desc => '{PSPlugin.extract_date}',
        type => 'flag',
    },
    {   name => 'extract_pages',
        desc => '{PSPlugin.extract_pages}',
        type => 'flag',
    },
    {   name => 'extract_title',
        desc => '{PSPlugin.extract_title}',
        type => 'flag',
    },
];
79
# Plugin description record; appended to the caller-supplied "OptList" in
# new().
my $options = {
    name     => 'PSPlugin',
    desc     => '{PSPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',

    # Source docs in postscript format can be replaced with GS-generated html
    srcreplaceable => 'yes',

    args => $arguments,
};
86
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the class name is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      and its pattern are appended to it
#   $hashArgOptLists - hash ref; this plugin's argument and option records
#                      are appended to its "ArgList" and "OptList" arrays
#
# Returns the object built by the ConvertBinaryFile constructor, re-blessed
# into $class.  If that object has 'info_only' set it is returned straight
# away, before any file-type settings or secondary plugins are configured.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better
    # way?  The same pattern is handed to this plugin and to each secondary
    # plugin configured below.
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than indirect-object syntax
    # ("new ConvertBinaryFile(...)"), whose parsing depends on which names
    # Perl has already seen.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TextPlugin'};

    # PagedImagePlugin options are only seeded when a paged-image conversion
    # was requested and no options for that plugin exist yet.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
        }
    }

    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
140
141
# Default value for the block_exp argument: a case-insensitive pattern
# matching file names that end in ".eps".  Any arguments are ignored.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
147
# Default value for the process_exp argument: a case-insensitive pattern
# matching file names that end in ".ps".  Any arguments are ignored.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
153
# Rewrite the converted file $conv_filename in place as UTF-8.
# (This hook has been commented out in other plugins; whether it is still
# needed here is an open question.)
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    # Ask for the language and encoding to read the converted file with.
    my ($language, $encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # Load the file; the contents land in $content (as utf8).
    my $content = "";
    $self->read_file($conv_filename, $encoding, $language, \$content);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$content);

    # Overwrite the converted file with the cleaned-up text.
    $self->utf8_write_file(\$content, $conv_filename);
}
174
# Try to turn the text of a "Creation..." comment into a date.
#
# Three layouts are tried in order; each supplies (day, month, year) to
# sorttools::format_date, and the first defined result is returned.
# Returns undef when no layout matches or every result is undefined.
sub _parse_postscript_creation_date {
    my ($text) = @_;

    # Captures are copied into lexicals before calling out, so the callee
    # never sees aliases of $1/$2/$3.
    my @candidates;

    # "<Mon> <dd> <time> <yyyy>", e.g. " Feb  3 04:05:06 2001"
    if (my ($month, $day, $year) =
            $text =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
        push(@candidates, [$day, $month, $year]);
    }

    # "D:<yyyy><mm><dd><hhmmss>)", e.g. "(D:20010203040506)"
    if (my ($year, $month, $day) =
            $text =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
        push(@candidates, [$day, $month, $year]);
    }

    # "CreationDate: <yyyy> <Mon> <dd> <time>"
    if (my ($year, $month, $day) =
            $text =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
        push(@candidates, [$day, $month, $year]);
    }

    foreach my $candidate (@candidates) {
        my $date = sorttools::format_date(@$candidate);
        return $date if defined $date;
    }

    return;
}

# Scan the PostScript file $filename line by line and add Title, Date and
# Pages metadata to the top section of $doc.
#
# Arguments:
#   $filename - path of the PostScript file to read
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Each kind of metadata is only looked for when the matching flag
# ('extract_title', 'extract_date', 'extract_pages') is set on $self, and
# scanning for a kind stops once a value for it has been added.  A comment
# that carries no usable value, such as "%%CreationDate: (atend)" or
# "%%Pages: (atend)", does not stop the scan, so the real value further down
# the file is still picked up.  At most one Date is added.
#
# If the file cannot be opened a message is printed to STDERR and nothing
# is added.  Returns nothing useful.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open on a lexical handle: the file name can never be
    # mistaken for an open mode, and a failure is reported instead of
    # silently reading from an unopened handle.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlugin: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9\@/\(\):,. ]*)}g) {
                # Strip placeholder titles and the surrounding parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m{(Creation[-A-Za-z0-9\@/\(\):,. ]*)}g) {
                my $date = _parse_postscript_creation_date($word);
                next if !defined $date;

                $doc->add_utf8_metadata($section, "Date", $date);
                # Only mark the date as found once one has really been added.
                $date_found = 1;
                last;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);

    return;
}
243
# Plugin-specific processing of the document object.
#
# Called with ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli).
# Metadata is first extracted from the PostScript file itself, then the
# same arguments are handed on unchanged to the parent class's process,
# whose result is returned.
sub process {
    my $self = shift (@_);

    # Only the base directory, file name and document object are needed here.
    my ($base_dir, $file, $doc_obj) = @_[1, 2, 4];

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
255
256
257	1;
258

Note: See TracBrowser for help on using the repository browser.

Download in other formats: