source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the category regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlug;
29
30use ConvertToPlug;
31use sorttools;
32
# Set up inheritance at compile time: this package derives from ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
36
# Argument descriptors for this plugin, one hash per option.
#   name - option name
#   desc - brace-wrapped key (presumably looked up in a resource bundle
#          for the human-readable description -- confirm against BasPlug)
#   type - "regexp" for expression arguments, "flag" for on/off switches
#   deft - default value, taken from the get_default_*_exp subs in this file
#   reqd - whether the argument is required
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'extract_date',
        'desc' => '{PSPlug.extract_date}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_pages',
        'desc' => '{PSPlug.extract_pages}',
        'type' => 'flag',
    },
    {
        'name' => 'extract_title',
        'desc' => '{PSPlug.extract_title}',
        'type' => 'flag',
    },
];
56
# Plugin description record; new() appends it to the object's option_list.
# 'args' points at the argument descriptor list defined above in this file.
my $options = {
    'name'     => 'PSPlug',
    'desc'     => '{PSPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
62
# Constructor.
#
# Builds the object through the ConvertToPlug constructor (forcing
# "-convert_to text" and supplying a "-title_sub" pattern), registers this
# plugin's option description on the object's option_list, then parses the
# PSPlug-specific flags (extract_date, extract_pages, extract_title) out of
# the remaining arguments.
#
# Arguments: the class name followed by the plugin option list.
# Returns:   the object blessed into $class.
# Dies:      with "\n" after printing a diagnostic and the usage text when
#            parsargv::parse rejects the options.
sub new {
    my $class = shift(@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    my $self = ConvertToPlug->new($class, "-convert_to", "text", @_,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PSPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    # "allow_extra_options" is passed so that options not listed here are
    # not treated as errors (presumably they belong to the parent classes).
    my $parsed_ok = parsargv::parse(
        \@_,
        q^extract_date^,  \$self->{'extract_date'},
        q^extract_pages^, \$self->{'extract_pages'},
        q^extract_title^, \$self->{'extract_title'},
        "allow_extra_options"
    );

    if (!$parsed_ok) {
        # Name this plugin (not HTMLPlug) so the user checks the right
        # entry in collect.cfg.
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}
86
87# sub print_usage {
88# print STDERR "\n usage: plugin PSPlug [options]\n\n";
89# print STDERR " options:\n";
90# print STDERR " -extract_date Extract date from PS header\n";
91# print STDERR " -extract_pages Extract pages from PS header\n";
92# print STDERR " -extract_title Extract title from PS header\n";
93# print STDERR "\n\nNote! This is a \"poor man's\" ps to text converter. If you are serious, consider\n";
94# print STDERR "using the PRESCRIPT package, which is available for download at\n http://www.nzdl.org/html/software.html\n\n";
95# }
96
# Default "block" expression: a case-insensitive match on filenames ending
# in ".eps". Any invocant or arguments passed in are ignored.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
102
# Default "process" expression: a case-insensitive match on filenames ending
# in ".ps". Any invocant or arguments passed in are ignored.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
108
# Scan a PostScript file line by line for "Title:", "Creation..." and
# "Pages:" text and record what is found as metadata on the document's top
# section.
#
# Arguments:
#   $self     - plugin object; the 'extract_title', 'extract_date' and
#               'extract_pages' flags select what is looked for, and
#               'verbosity' > 1 enables a progress message on STDERR
#   $filename - path of the PostScript file to read
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata($section, $name, $value)
#
# Returns nothing useful. If the file cannot be opened a warning is printed
# to STDERR and no metadata is added (best effort: this never dies).
sub extract_metadata_from_postscript {
    my $self     = shift(@_);
    my $filename = shift(@_);
    my $doc      = shift(@_);
    my $section  = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open on a lexical handle: the filename is never
    # interpreted as a mode or pipe, and the handle cannot clash with a
    # global INPUT handle used elsewhere.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    my $date;

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder / generator noise and the enclosing
                # parentheses from the raw title text.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Form: "... Mon DD hh:mm:ss YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2, $1, $3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # Form: "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # Form: "CreationDate: YYYY Mon DD hh:mm..."
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3, $2, $1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # Any "Creation..." text ends the date search, whether or
                # not one of the formats above could be parsed.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore empty or zero page counts and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Once every requested item has been found (or was never requested)
        # no later line can add anything, so stop reading the file.
        last if (!$self->{'extract_title'} || $title_found)
             && (!$self->{'extract_date'}  || $date_found)
             && (!$self->{'extract_pages'} || $pages_found);
    }

    close($input);
    return;
}
176
# Plugin-specific processing for PostScript documents.
#
# Extracts metadata from the file at "$path/$file" onto the supplied document
# object, then hands all of the arguments to ConvertToPlug::process_type with
# the type "ps" and returns whatever that call returns.
#
# After the invocant, only the third (path), fourth (file name) and sixth
# (document object) positional arguments are used here; the others are
# passed through untouched.
sub process {
    my $self = shift(@_);

    my $path = $_[2];
    my $file = $_[3];
    my $doc  = $_[5];

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "PSPlug: passing $file on to $self->{'converted_to'}Plug\n";
    }

    extract_metadata_from_postscript($self, "$path/$file", $doc);
    #$doc->add_utf8_metadata($doc->get_top_section(), "PSPlug", 1);

    return ConvertToPlug::process_type($self, "ps", @_);
}
190
191
1921;
193
Note: See TracBrowser for help on using the repository browser.