source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 9465

Last change on this file since 9465 was 9465, checked in by kjdon, 19 years ago

ConvertToPlug now accepts the gli arg in read() and passes it in process(), and all the convert plugins now print a processing message for gli in their process() method.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.6 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# 12/05/02 Added usage datastructure - John Thompson
27
28package PSPlug;
29
30use ConvertToPlug;
31use sorttools;
32
# Establish the inheritance chain at compile time: PSPlug is a ConvertToPlug.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
36
# Argument descriptions specific to this plugin.  The two regexp arguments
# take their defaults from this package's get_default_*_exp() subs; the
# three extract_* arguments are plain flags that share one layout, so they
# are generated from their suffixes.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    (
        map {
            +{
                'name' => 'extract_' . $_,
                'desc' => '{PSPlug.extract_' . $_ . '}',
                'type' => 'flag',
            }
        } qw(date pages title)
    ),
];
56
# Option record for this plugin; new() appends it to the option list that
# the parent constructor created.
my $options = {
    'name'     => 'PSPlug',
    'desc'     => '{PSPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
62
# Constructor.
#
# Builds the object through ConvertToPlug (forcing "-convert_to text" and
# supplying a "-title_sub" pattern), registers this plugin's option record
# on the object's option list, and then parses the PSPlug-specific flags
# (extract_date, extract_pages, extract_title) out of the caller's
# arguments.  Unknown options are tolerated ("allow_extra_options").
#
# Dies, after printing a usage message, if the option parser rejects the
# arguments.
sub new {
    my $class = shift (@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    my $self = ConvertToPlug->new($class, "-convert_to", "text", @_,
                                  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PSPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
                         q^extract_date^, \$self->{'extract_date'},
                         q^extract_pages^, \$self->{'extract_pages'},
                         q^extract_title^, \$self->{'extract_title'},
                         "allow_extra_options")) {
        # Name this plugin in the message so the user looks at the right
        # entry in collect.cfg.
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}
86
87
# Default block_exp: a case-insensitive pattern matching file names that
# end in ".eps".  Any invocant/argument is accepted and ignored.
sub get_default_block_exp {
    my ($self) = @_;

    my $eps_pattern = '(?i)\.(eps)$';
    return $eps_pattern;
}
93
# Default process_exp: a case-insensitive pattern matching file names that
# end in ".ps".  Any invocant/argument is accepted and ignored.
sub get_default_process_exp {
    my ($self) = @_;

    my $ps_pattern = '(?i)\.ps$';
    return $ps_pattern;
}
99
# Scan a PostScript file line by line for "Title: ...", "Creation..." and
# "Pages: ..." text (presumably the document's header comments) and record
# what is found as Title, Date and Pages metadata on the top section of
# the document.
#
# Arguments:
#   $self     - plugin object.  The 'extract_title', 'extract_date' and
#               'extract_pages' flags select what is looked for;
#               'verbosity' > 1 enables a progress message on STDERR.
#   $filename - path of the PostScript file to scan.
#   $doc      - document object providing get_top_section() and
#               add_utf8_metadata().
#
# Returns nothing useful.  If the file cannot be opened a message is
# printed to STDERR and no metadata is added; the failure is not fatal.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my $filename = shift (@_);
    my $doc = shift (@_);
    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle cannot leak into other code.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" to extract metadata: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder/application names and the surrounding
                # parentheses from the title text.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }
        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Try the known date layouts in turn and keep the first
                # one that yields a usable date, so a single match adds at
                # most one Date value.
                my $date;
                # e.g. "... May 12 10:30:00 2002"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = sorttools::format_date($2,$1,$3);
                }
                # e.g. "(D:20020512103000)"
                if (!defined $date
                    && $word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = sorttools::format_date($3,$2,$1);
                }
                # e.g. "CreationDate: 2002 May 12 10:30"
                if (!defined $date
                    && $word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = sorttools::format_date($3,$2,$1);
                }
                # Only stop looking once a date has really been recorded;
                # an unparsable first match (such as "(atend)") must not
                # hide a usable date further down the file.
                if (defined $date) {
                    $doc->add_utf8_metadata($section, "Date", $date );
                    $date_found = 1;
                }
            }
        }
        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore missing or zero page counts.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }

        # Nothing left to look for: no need to read the rest of the file.
        last if (!$self->{'extract_title'} || $title_found)
             && (!$self->{'extract_date'}  || $date_found)
             && (!$self->{'extract_pages'} || $pages_found);
    }

    close($input);
    return;
}
167
# Plugin-specific processing for PostScript documents.
#
# Reports progress (to the plugin's output handle when verbosity > 1, and
# as a <Processing> tag on STDERR when $gli is set), extracts metadata
# straight from the PostScript source at "$path/$file" into $doc, and then
# hands the unchanged argument list on to ConvertToPlug::process_type()
# with the type "ps".
#
# Arguments after $self are positional; only the 3rd ($path), 4th ($file),
# 6th ($doc) and 7th ($gli) are used here.
#
# Returns whatever ConvertToPlug::process_type() returns.
sub process {
    my $self = shift (@_);
    my ($trash, $trash2, $path, $file, $trash3, $doc, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "PSPlug: passing $file on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;
    # Identify this plugin (not the PDF plugin) in the progress tag.
    print STDERR "<Processing n='$file' p='PSPlug'>\n" if ($gli);

    extract_metadata_from_postscript($self, "$path/$file", $doc);

    return ConvertToPlug::process_type($self, "ps", @_);
}
182
183
1841;
185
Note: See TracBrowser for help on using the repository browser.