source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 5924

Last change on this file since 5924 was 5924, checked in by kjdon, 20 years ago

changed the new metadata to eg WordPlug instead of Word, cos a clash with Image

  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlug;
29
30use ConvertToPlug;
31use sorttools;
32
# Set up inheritance at compile time: PSPlug is a ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
36
# Descriptions of the options this plugin accepts. Each entry names the
# option, gives a description key, a type and (for the two expressions) a
# default taken from the get_default_*_exp subs defined further down.
# NOTE(review): the "{Plugin.key}" desc strings look like resource-bundle
# keys resolved elsewhere -- confirm.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "string",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "extract_date",
        'desc' => "{PSPlug.extract_date}",
        'type' => "flag",
    },
    {
        'name' => "extract_pages",
        'desc' => "{PSPlug.extract_pages}",
        'type' => "flag",
    },
    {
        'name' => "extract_title",
        'desc' => "{PSPlug.extract_title}",
        'type' => "flag",
    },
];
56
# Top-level option record for this plugin; new() pushes it onto the
# option list inherited from the parent plugin.
my $options = {
    'name'     => "PSPlug",
    'desc'     => "{PSPlug.desc}",
    'inherits' => "yes",
    'args'     => $arguments,
};
61
# Constructor.
#
# Builds the object through ConvertToPlug's constructor (forcing conversion
# to text and supplying a -title_sub pattern), registers this plugin's
# option description, and then parses the PSPlug-specific flags
# (-extract_date, -extract_pages, -extract_title) from the arguments.
#
# Arguments: the class name followed by the plugin option list.
# Returns:   the new object, blessed into $class.
# Dies (after printing usage to STDERR) if the options cannot be parsed.
sub new {
    my $class = shift (@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    # Explicit Class->method form: same call as the old indirect-object
    # "new ConvertToPlug (...)" but not subject to parsing ambiguity.
    my $self = ConvertToPlug->new($class, "-convert_to", "text", @_,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PSPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # NOTE(review): "allow_extra_options" presumably lets options meant for
    # the parent plugins pass through without error -- confirm in parsargv.
    if (!parsargv::parse(\@_,
                         q^extract_date^, \$self->{'extract_date'},
                         q^extract_pages^, \$self->{'extract_pages'},
                         q^extract_title^, \$self->{'extract_title'},
                         "allow_extra_options")) {
        # This message used to name HTMLPlug (copy-paste slip), which sent
        # people looking at the wrong plugin line in collect.cfg.
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}
85
86# sub print_usage {
87# print STDERR "\n usage: plugin PSPlug [options]\n\n";
88# print STDERR " options:\n";
89# print STDERR " -extract_date Extract date from PS header\n";
90# print STDERR " -extract_pages Extract pages from PS header\n";
91# print STDERR " -extract_title Extract title from PS header\n";
92# print STDERR "\n\nNote! This is a \"poor man's\" ps to text converter. If you are serious, consider\n";
93# print STDERR "using the PRESCRIPT package, which is available for download at\n http://www.nzdl.org/html/software.html\n\n";
94# }
95
# Default block expression: a case-insensitive match on file names ending
# in ".eps". Any arguments passed in are ignored.
sub get_default_block_exp {
    my ($self) = @_;

    my $block_exp = q{(?i)\.(eps)$};
    return $block_exp;
}
101
# Default process expression: a case-insensitive match on file names ending
# in ".ps". Any arguments passed in are ignored.
sub get_default_process_exp {
    my ($self) = @_;

    my $process_exp = q{(?i)\.ps$};
    return $process_exp;
}
107
# Scan a PostScript file line by line for "Title:", "Creation..." and
# "Pages:" text and record whichever of them were requested as metadata on
# the top section of the document object.
#
# Arguments:
#   $self     - plugin object; reads 'verbosity', 'extract_title',
#               'extract_date' and 'extract_pages'
#   $filename - path of the PostScript file to scan
#   $doc      - document object; get_top_section() and add_utf8_metadata()
#               are called on it
#
# Returns nothing. If the file cannot be opened, a message is printed to
# STDERR and no metadata is added (metadata extraction is best-effort, so
# this does not die).
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my $filename = shift (@_);
    my $doc = shift (@_);
    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle cannot clash with other globals.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        # Nothing below does any work once every requested item has been
        # found, so stop reading instead of scanning the rest of the file.
        last if (!$self->{'extract_title'} || $title_found)
             && (!$self->{'extract_date'}  || $date_found)
             && (!$self->{'extract_pages'} || $pages_found);

        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # surrounding parentheses of a PostScript string.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Captures are copied into lexicals before calling
                # format_date so that any regex it runs cannot disturb them.

                # e.g. " Jan 12 10:15:00 2003" (month, day, time, year)
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    my ($month, $day, $year) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # e.g. "D:20030112101500)" (yyyymmdd followed by six digits)
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # e.g. "CreationDate: 2003 Jan 12 10:15:00"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # The first "Creation..." text ends the date search even
                # when none of the formats above matched (as before).
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore an empty or zero page count and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);
    return;
}
175
# Plugin-specific processing for a PostScript document: extract metadata
# from the original .ps file, then pass the unchanged argument list on to
# ConvertToPlug::process_type with the type "ps" and return its result.
sub process {
    my $self = shift;

    # Only three of the remaining arguments are used directly here:
    # index 2 (path), index 3 (file name) and index 5 (document object).
    my ($path, $file, $doc) = @_[2, 3, 5];

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "PSPlug: passing $file on to $self->{'converted_to'}Plug\n";
    }

    extract_metadata_from_postscript($self, "$path/$file", $doc);
    #$doc->add_utf8_metadata($doc->get_top_section(), "PSPlug", 1);
    return ConvertToPlug::process_type($self, "ps", @_);
}
189
190
1911;
192
Note: See TracBrowser for help on using the repository browser.