Context Navigation

source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 2996

Last change on this file since 2996 was 2979, checked in by jrm21, 22 years ago
Use self->converted_to instead of convert_to, in case the file could not be converted to the format we asked for (eg HTML), but was converted to another format (eg TEXT).
Property svn:keywords set to `Author Date Id Revision`
File size: 5.1 KB

Line
1	###########################################################################
2	#
3	# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PSPlug;
27
28	use ConvertToPlug;
29	use sorttools;
30
# Declare ConvertToPlug as this package's parent class.  A BEGIN block runs
# at compile time, so the inheritance is set up before the rest of the file
# is compiled.
BEGIN {
    our @ISA = qw(ConvertToPlug);
}
34
# Constructor.
#
# Builds the underlying ConvertToPlug object, asking for conversion to text
# and supplying a -title_sub pattern, then parses this plugin's own options
# (-extract_date, -extract_pages, -extract_title) out of the argument list.
#
# Arguments: the class name followed by the plugin option list.
# Returns:   the new object, blessed into $class.
# Dies:      after printing a usage message if parsargv::parse rejects
#            the options.
sub new {
    my $class = shift(@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    # The caller's options are placed after our "-convert_to text" default
    # and before "-title_sub", exactly as before.
    my $self = ConvertToPlug->new(
        $class,
        "-convert_to", "text",
        @_,
        "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'
    );

    my $options_ok = parsargv::parse(
        \@_,
        q^extract_date^,  \$self->{'extract_date'},
        q^extract_pages^, \$self->{'extract_pages'},
        q^extract_title^, \$self->{'extract_title'},
        "allow_extra_options"
    );

    if (!$options_ok) {
        # This message used to name HTMLPlug, which sent users looking at
        # the wrong plugin line in collect.cfg.
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    return bless $self, $class;
}
54
# Write the plugin's usage summary (the three -extract_* options plus a
# pointer to the PRESCRIPT package) to STDERR.
# Returns the result of the final print, as the original chain of print
# statements did.
sub print_usage {
    my @usage_chunks = (
        "\n usage: plugin PSPlug [options]\n\n",
        " options:\n",
        " -extract_date Extract date from PS header\n",
        " -extract_pages Extract pages from PS header\n",
        " -extract_title Extract title from PS header\n",
        "\n\nNote! This is a \"poor man's\" ps to text converter. If you are serious, consider\n",
        "using the PRESCRIPT package, which is available for download at\n http://www.nzdl.org/html/software.html\n\n",
    );

    # One print per chunk keeps the output sequence identical to the
    # original statement-per-line form.
    my $last_result;
    foreach my $chunk (@usage_chunks) {
        $last_result = print STDERR $chunk;
    }
    return $last_result;
}
64
# Default "block" expression for this plugin: a case-insensitive pattern
# that matches file names ending in ".eps".
# NOTE(review): how the caller applies a block expression is defined
# outside this file -- presumably matching files are skipped; confirm in
# the base plugin.
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)\.(eps)$';
}
70
# Default "process" expression for this plugin: a case-insensitive pattern
# that matches file names ending in ".ps".
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.ps$';
}
76
# Scan a PostScript file line by line and copy header information into the
# document object as metadata.
#
# Arguments:
#   $self     - plugin object; the flags extract_title, extract_date and
#               extract_pages select what is harvested, and verbosity > 1
#               enables a progress message on STDERR.
#   $filename - path of the PostScript file to read.
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value).
#
# At most one Title, one Date and one Pages value are added, all on the
# document's top section.  Reading stops as soon as every requested field
# has been dealt with, so large files are not scanned to the end.
#
# This is best-effort: if the file cannot be opened a message is written to
# STDERR and the sub returns without adding anything.  Returns nothing
# useful.
sub extract_metadata_from_postscript {
    my ($self, $filename, $doc) = @_;
    my $section = $doc->get_top_section();

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Fields still to be looked for; each is cleared once handled.
    my $want_title = $self->{'extract_title'} ? 1 : 0;
    my $want_date  = $self->{'extract_date'}  ? 1 : 0;
    my $want_pages = $self->{'extract_pages'} ? 1 : 0;
    return unless $want_title || $want_date || $want_pages;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle cannot clash with another
    # package-level INPUT handle.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($want_title) {
            foreach my $candidate ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                # Strip placeholder titles and the parentheses that wrap
                # the value.
                my $title = $candidate;
                $title =~ s/\(Untitled\)//i;
                $title =~ s/\(Microsoft Word\)//i;
                $title =~ s/Microsoft Word//i;
                $title =~ s/^\(//;
                $title =~ s/\)$//;
                $title =~ s/^ - //;
                next if $title eq "";

                $doc->add_utf8_metadata($section, "Title", $title);
                $want_title = 0;
                last;    # one title only
            }
        }

        if ($want_date) {
            my @candidates = ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g);
            if (@candidates) {
                foreach my $candidate (@candidates) {
                    my $date = _postscript_creation_date($candidate);
                    next unless defined $date;
                    $doc->add_utf8_metadata($section, "Date", $date);
                    last;    # one date only
                }
                # As before, the first line that mentions "Creation" ends
                # the search for a date, whether or not it could be parsed.
                $want_date = 0;
            }
        }

        if ($want_pages) {
            foreach my $candidate ($line =~ m/(Pages: [0-9]*)/g) {
                (my $digits = $candidate) =~ s/[^0-9]//g;
                # A "Pages:" entry with no number or with 0 is ignored, so
                # a later line can still supply the real count.
                next if $digits eq "" || $digits eq "0";

                $doc->add_utf8_metadata($section, "Pages", $digits);
                $want_pages = 0;
                last;    # one page count only
            }
        }

        last unless $want_title || $want_date || $want_pages;
    }

    close($input);
    return;
}

# Turn one "Creation..." header fragment into a formatted date.
#
# Tries, in order: "Mon DD hh:mm:ss YYYY", "D:YYYYMMDDhhmmss)" and
# "CreationDate: YYYY Mon DD".  Each match is handed to
# sorttools::format_date(day, month, year); the first defined result is
# returned.  Returns undef when nothing usable is found.
sub _postscript_creation_date {
    my ($text) = @_;

    # Copy the captures into lexicals straight away so that a later regex
    # (here or inside format_date) cannot change them under us.
    my @day_month_year;
    if ($text =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
        push @day_month_year, [$2, $1, $3];
    }
    if ($text =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
        push @day_month_year, [$3, $2, $1];
    }
    if ($text =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
        push @day_month_year, [$3, $2, $1];
    }

    foreach my $parts (@day_month_year) {
        my ($day, $month, $year) = @$parts;
        my $date = sorttools::format_date($day, $month, $year);
        return $date if defined $date;
    }
    return;
}
144
# Plugin-specific processing for PostScript documents.
#
# Harvests header metadata from the original file into the document object,
# then hands the unchanged argument list to ConvertToPlug::process_type
# with the type tag "ps" and returns whatever that call returns.
#
# After $self, only three positional arguments are used here: index 2 (the
# path prefix), index 3 (the file name) and index 5 (the document object).
# The others are passed through untouched.
sub process {
    my $self = shift(@_);
    my ($path, $file, $doc) = @_[2, 3, 5];

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print {$outhandle} "PSPlug: passing $file on to $self->{'converted_to'}Plug\n";
    }

    extract_metadata_from_postscript($self, "$path/$file", $doc);

    return ConvertToPlug::process_type($self, "ps", @_);
}
158
159
160	1;
161

Note: See TracBrowser for help on using the repository browser.

Download in other formats: