source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 2996

Last change on this file since 2996 was 2979, checked in by jrm21, 22 years ago

Use self->converted_to instead of convert_to, in case the file could not be
converted to the format we asked for (eg HTML), but was converted to another
format (eg TEXT).

  • Property svn:keywords set to Author Date Id Revision
File size: 5.1 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PSPlug;
27
28use ConvertToPlug;
29use sorttools;
30
# Set up inheritance at compile time: this package's @ISA names
# ConvertToPlug as its only parent class.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
34
# Constructor.
#
# Creates the underlying ConvertToPlug object (passing "-convert_to text"
# and a "-title_sub" pattern along with the caller's options) and then
# reads the PSPlug-specific switches -extract_date, -extract_pages and
# -extract_title into the object.
#
# Arguments: the class name, followed by the plugin option list.
# Returns:   the plugin object, blessed into $class.
# Dies:      with "\n", after printing a message and the usage text, when
#            parsargv::parse reports a failure.
sub new {
    my $class = shift (@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    my $self = ConvertToPlug->new($class, "-convert_to", "text", @_,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');

    # "allow_extra_options" lets options meant for the parent class through.
    if (!parsargv::parse(\@_,
                         q^extract_date^,  \$self->{'extract_date'},
                         q^extract_pages^, \$self->{'extract_pages'},
                         q^extract_title^, \$self->{'extract_title'},
                         "allow_extra_options")) {
        # Report the problem against this plugin's own name so the user
        # looks at the right line of collect.cfg.
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    return bless $self, $class;
}
54
# Write the usage summary for this plugin to STDERR.
#
# Takes no arguments.  Returns the result of the final print.
sub print_usage {
    my @usage_text = (
        "\n usage: plugin PSPlug [options]\n\n",
        " options:\n",
        " -extract_date Extract date from PS header\n",
        " -extract_pages Extract pages from PS header\n",
        " -extract_title Extract title from PS header\n",
        "\n\nNote! This is a \"poor man's\" ps to text converter. If you are serious, consider\n",
        "using the PRESCRIPT package, which is available for download at\n http://www.nzdl.org/html/software.html\n\n",
    );

    # One print per chunk, exactly as many writes as there are chunks.
    my $printed;
    foreach my $chunk (@usage_text) {
        $printed = print STDERR $chunk;
    }
    return $printed;
}
64
# Return the default block expression for this plugin, as a string:
# a case-insensitive pattern matching names that end in ".eps".
sub get_default_block_exp {
    my ($self) = @_;

    my $eps_suffix = '(?i)\.(eps)$';
    return $eps_suffix;
}
70
# Return the default process expression for this plugin, as a string:
# a case-insensitive pattern matching names that end in ".ps".
sub get_default_process_exp {
    my ($self) = @_;

    my $ps_suffix = '(?i)\.ps$';
    return $ps_suffix;
}
76
# Scan a PostScript file line by line and record Title, Date and Pages
# metadata on the top section of a document object.
#
# Arguments:
#   $self     - plugin object; its 'extract_title', 'extract_date' and
#               'extract_pages' flags select what is looked for, and
#               'verbosity' (> 1) enables a progress message on STDERR.
#   $filename - path of the PostScript file to scan.
#   $doc      - document object; get_top_section() and
#               add_utf8_metadata() are called on it.
#
# Returns nothing useful.  If the file cannot be opened, a message is
# printed to STDERR and no metadata is added.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my $filename = shift (@_);
    my $doc = shift (@_);
    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and a failure is reported instead of being
    # silently treated as an empty file.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles and the surrounding parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;

                # e.g. "... Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = sorttools::format_date($2, $1, $3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # e.g. "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # e.g. "CreationDate: YYYY Mon DD hh:mm"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }

                # Any "Creation..." text ends the date search, whether or
                # not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore "Pages:" with no number (or zero) and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Nothing left to look for: no need to read the rest of the file.
        last if (!$self->{'extract_title'} || $title_found)
             && (!$self->{'extract_date'}  || $date_found)
             && (!$self->{'extract_pages'} || $pages_found);
    }

    close($input);
    return;
}
144
# Plugin-specific processing for PostScript documents: extract metadata
# from the file at "<path>/<file>", then hand the unchanged argument list
# to ConvertToPlug::process_type with document type "ps" and return its
# result.
sub process {
    my $self = shift (@_);

    # Only the path, file name and document arguments are used here; the
    # others are passed on below without being looked at.
    my (undef, undef, $dir, $file, undef, $doc_obj) = @_;

    if ($self->{'verbosity'} > 1) {
        my $out = $self->{'outhandle'};
        print $out "PSPlug: passing $file on to $self->{'converted_to'}Plug\n";
    }

    extract_metadata_from_postscript($self, "$dir/$file", $doc_obj);

    return ConvertToPlug::process_type($self, "ps", @_);
}
158
159
1601;
161
Note: See TracBrowser for help on using the repository browser.