Context Navigation

source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 10537

Last change on this file since 10537 was 10536, checked in by chi, 19 years ago
Added paged-image conversion types for PS documents, so PS documents can now be converted to the TEXT, PAGEDIMG_JPG, PAGEDIMG_GIF and PAGEDIMG_PNG types.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.8 KB

Line
1	###########################################################################
2	#
3	# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# 12/05/02 Added usage datastructure - John Thompson
27
28	package PSPlug;
29
30	use ConvertToPlug;
31	use sorttools;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Set up inheritance at compile time: PSPlug derives from ConvertToPlug.
BEGIN {
    @PSPlug::ISA = ('ConvertToPlug');
}
39
# The values accepted by the "convert_to" argument. Each entry is a hash
# with the value's 'name' and a 'desc' of the form
# "{ConvertToPlug.convert_to.<name>}" (presumably a key into the string
# resources -- confirm against the resource bundle).
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertToPlug.convert_to.$_}" } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
52
# Description of the arguments this plugin accepts. Each entry gives the
# argument 'name', a 'desc' string, its 'type', and, where applicable, a
# default ('deft'), whether it is required ('reqd') and the permitted
# values ('list').
my $arguments = [
    { 'name' => "convert_to",
      'desc' => "{ConvertToPlug.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $convert_to_list,
      # The default has to be one of the names in $convert_to_list. "html"
      # is not among them, so plain text is used as the default.
      'deft' => "text" },
    { 'name' => "process_exp",
      'desc' => "{BasPlug.process_exp}",
      'type' => "regexp",
      'deft' => get_default_process_exp(),
      'reqd' => "no" },
    { 'name' => "block_exp",
      'desc' => "{BasPlug.block_exp}",
      'type' => 'regexp',
      'deft' => get_default_block_exp() },
    { 'name' => "extract_date",
      'desc' => "{PSPlug.extract_date}",
      'type' => "flag" },
    { 'name' => "extract_pages",
      'desc' => "{PSPlug.extract_pages}",
      'type' => "flag" },
    { 'name' => "extract_title",
      'desc' => "{PSPlug.extract_title}",
      'type' => "flag" },
];
78
# Top-level description of the plugin itself: its name, description
# string, the 'abstract' and 'inherits' settings, and the argument list
# defined for it.
my $options = {
    'name'     => "PSPlug",
    'desc'     => "{PSPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
84
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub"
#                      option is appended to it
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" entries
#                      receive this plugin's argument and option descriptions
#
# Builds the object through ConvertToPlug's constructor, adds options for
# the TEXTPlug (and, for pagedimg conversions, PagedImgPlug) secondary
# plugins, then loads the secondary plugins. Returns the blessed object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    # Note: "-convert_to text" is deliberately no longer forced here.
    #push(@$inputargs,"-convert_to");
    #push(@$inputargs,"text");
    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Direct method call rather than indirect object syntax ("new Class ...").
    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : ConvertToPlug->new($pluginlist, $inputargs);

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    # For paged-image conversions, give PagedImgPlug the same title_sub,
    # but only when no PagedImgPlug options have been set up already.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImgPlug'}) {
            $secondary_plugin_options->{'PagedImgPlug'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImgPlug'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
        }
    }

    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language");
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
130
131
# Returns the default value of the "block_exp" argument: a regular
# expression (as a string) matching names that end in ".eps", ignoring case.
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)\.(eps)$';
}
137
# Returns the default value of the "process_exp" argument: a regular
# expression (as a string) matching names that end in ".ps", ignoring case.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.ps$';
}
143
# Post-processes a converted file in place so that it is valid UTF-8.
#
# Arguments:
#   $conv_filename - path of the converted file; it is read and then
#                    overwritten
#
# The language and encoding are obtained from
# textcat_get_language_encoding(), the file is read with them, any bytes
# that are not valid UTF-8 are converted, and the text is written back to
# the same file.
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
163
# Scans a PostScript file for header comments and records what it finds
# as metadata on the top section of the document object.
#
# Arguments:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Which items are looked for depends on the plugin's flags:
#   extract_title - "Title" from a "Title: ..." comment
#   extract_date  - "Date" from a "Creation..." comment
#   extract_pages - "Pages" from a "Pages: N" comment
# Once an item has been found, later lines are not searched for it.
#
# If the file cannot be opened, a message is printed to STDERR and no
# metadata is added. Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Track what still has to be found. If no extraction flag is set there
    # is nothing to look for, so the file is not read at all.
    my $need_title = $self->{'extract_title'} ? 1 : 0;
    my $need_date  = $self->{'extract_date'}  ? 1 : 0;
    my $need_pages = $self->{'extract_pages'} ? 1 : 0;
    return unless ($need_title || $need_date || $need_pages);

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and failure is reported instead of ignored.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: unable to open \"$filename\" to extract metadata: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($need_title) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\/\(\):,. ]*)}g) {
                # Strip placeholder titles, application names and the
                # parentheses that delimit a PostScript string.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $need_title = 0;
                }
            }
        }

        if ($need_date) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;
                # Month name, day, time, then year.
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # "D:" followed by four, two and two digits, then six more.
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # Year, month name, day, then time.
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # The first "Creation..." comment ends the search for a
                # date, whether or not its format was recognised.
                $need_date = 0;
            }
        }

        if ($need_pages) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # An empty or zero page count is ignored; keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $need_pages = 0;
                }
            }
        }

        # Nothing left to find: no need to read the rest of the file.
        last unless ($need_title || $need_date || $need_pages);
    }

    close($input);
    return;
}
232
# Plugin-specific processing of a PostScript file: metadata is extracted
# from the file found at $file under $base_dir and added to $doc_obj, then
# the work is handed to process_type() with the type "ps". The result of
# process_type() is returned.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # my $outhandle = $self->{'outhandle'};

    # print $outhandle "PSPlug: passing $file on to $self->{'converted_to'}Plug\n"
    # if $self->{'verbosity'} > 1;
    # print STDERR "<Processing n='$file' p='PSPlug'>\n" if ($gli);

    my $ps_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->process_type("ps", $base_dir, $file, $doc_obj);
}
249
250
251	1;
252

Note: See TracBrowser for help on using the repository browser.

Download in other formats: