Context Navigation

source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

Property svn:keywords set to Author Date Id Revision

File size: 5.1 KB

Line
1	###########################################################################
2	#
3	# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package PSPlug;
27
28	use ConvertToPlug;
29	use sorttools;
30
# Establish inheritance at compile time: PSPlug is a ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
34
# Constructor.
#
# Builds the object via ConvertToPlug's constructor, asking it to convert
# to "text" and passing a -title_sub pattern, then parses the options
# specific to this plugin out of the remaining arguments:
#
#   extract_date, extract_pages, extract_title
#
# Unknown options are tolerated ("allow_extra_options") so that options
# meant for the parent class do not cause a failure here.
#
# If parsargv::parse reports failure, an error and the usage text are
# printed to STDERR and the constructor dies.
#
# Returns the object blessed into $class.
sub new {
    my $class = shift (@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    my $self = ConvertToPlug->new ($class, "-convert_to", "text", @_,
                                   "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    if (!parsargv::parse(\@_,
                         q^extract_date^, \$self->{'extract_date'},
                         q^extract_pages^, \$self->{'extract_pages'},
                         q^extract_title^, \$self->{'extract_title'},
                         "allow_extra_options")) {
        # Name this plugin in the message (it used to say "HTMLPlug").
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    return bless $self, $class;
}
54
# Write the plugin's usage/help text to STDERR.
# Takes no arguments that it uses; the text is emitted chunk by chunk.
sub print_usage {
    my @usage_text = (
        "\n usage: plugin PSPlug [options]\n\n",
        " options:\n",
        " -extract_date Extract date from PS header\n",
        " -extract_pages Extract pages from PS header\n",
        " -extract_title Extract title from PS header\n",
        "\n\nNote! This is a \"poor man's\" ps to text converter. If you are serious, consider\n",
        "using the PRESCRIPT package, which is available for download at\n http://www.nzdl.org/html/software.html\n\n",
    );

    foreach my $chunk (@usage_text) {
        print STDERR $chunk;
    }
}
64
# Return the default "block" expression for this plugin as a regex
# string: a case-insensitive match on filenames ending in ".eps".
# (How the expression is applied is decided by the caller, which is not
# visible here.)
sub get_default_block_exp {
    my ($self) = @_;

    my $eps_pattern = '(?i)\.(eps)$';
    return $eps_pattern;
}
70
# Return the default "process" expression for this plugin as a regex
# string: a case-insensitive match on filenames ending in ".ps".
sub get_default_process_exp {
    my ($self) = @_;

    my $ps_pattern = '(?i)\.ps$';
    return $ps_pattern;
}
76
# Scan a PostScript file line by line for header comments and record what
# is found as metadata on the top section of $doc.
#
# Arguments:
#   $self     - plugin object; its 'extract_title', 'extract_date' and
#               'extract_pages' entries select what is looked for, and
#               'verbosity' > 1 enables a progress message on STDERR
#   $filename - path of the PostScript file to read
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Metadata added (each kind only while it has not yet been found):
#   Title - text following "Title: ", with "(Untitled)", "Microsoft Word",
#           surrounding parentheses and a leading " - " stripped
#   Date  - from a "Creation..." string in one of three recognised layouts,
#           as formatted by sorttools::format_date
#   Pages - the digits following "Pages: ", unless empty or "0"
#
# Extraction is best-effort: if the file cannot be opened a message is
# printed to STDERR and nothing is added.  No meaningful return value.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my $filename = shift (@_);
    my $doc = shift (@_);
    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and a failure is reported instead of ignored.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    my $date;

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/():,. ]*)}g) {
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }
        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Layout: "... Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Layout: "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Layout: "CreationDate: YYYY Mon DD hh:mm:ss"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Any "Creation..." string ends the date search, whether or
                # not one of the layouts above produced a date.
                $date_found = 1;
            }
        }
        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);
    return;
}
144
# Plugin-specific processing for a PostScript file: extract header
# metadata into the document object, then hand the file on to
# ConvertToPlug::process_type as type "ps".
#
# After $self, only the 3rd, 4th and 6th arguments (path, file name and
# document object) are used here; the complete argument list is still
# forwarded unchanged to ConvertToPlug.
sub process {
    my $self = shift (@_);
    my ($path, $file, $doc) = @_[2, 3, 5];

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "PSPlug: passing $file on to $self->{'convert_to'}Plug\n";
    }

    extract_metadata_from_postscript($self, "$path/$file", $doc);

    return ConvertToPlug::process_type($self, "ps", @_);
}
158
159
160	1;
161

Note: See TracBrowser for help on using the repository browser.

Download in other formats: