source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 1742

Last change on this file since 1742 was 1742, checked in by jrm21, 23 years ago

Added a comment to the usage stuff about PRESCRIPT.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1###########################################################################
2#
3# PSPlug.pm -- this might look VERY similar to the PDF plugin...
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package PSPlug;
27
28use ConvertToPlug;
29use sorttools;
30
# Set up inheritance from ConvertToPlug at compile time.  This block sits
# above the 'use strict' line, so the unqualified package variable @ISA
# needs no declaration here.
BEGIN {
    @ISA = qw(ConvertToPlug);
}
34
35use strict;
36
37
# Constructor for the PostScript plugin.
#
# Builds the underlying ConvertToPlug object, always passing
# "-convert_to text" ahead of the caller's options and a "-title_sub"
# pattern of 'Page\s+\d+' after a "--" separator.  It then parses this
# plugin's own switches (extract_date, extract_pages, extract_title) out of
# the caller's argument list into the object.
#
# Arguments: the class name, followed by the plugin option list.
# Returns:   the new object, blessed into $class.
# Dies:      with "\n" after printing an error and the usage text to STDERR
#            when parsargv::parse reports failure.
sub new {
    my $class = shift (@_);

    # Explicit class-method call; avoids the ambiguous indirect-object
    # form "new ConvertToPlug (...)".
    my $self = ConvertToPlug->new($class, ("-convert_to", "text", @_),
                                  "--", "-title_sub", 'Page\s+\d+');

    # "allow_extra_options" presumably tells parsargv to tolerate options
    # meant for the parent plugin -- TODO confirm against parsargv.
    if (!parsargv::parse(\@_,
                         q^extract_date^,  \$self->{'extract_date'},
                         q^extract_pages^, \$self->{'extract_pages'},
                         q^extract_title^, \$self->{'extract_title'},
                         "allow_extra_options")) {
        # Name this plugin (the message used to say HTMLPlug) so the user
        # knows which collect.cfg entry to look at.
        print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
        print_usage();
        die "\n";
    }

    return bless $self, $class;
}
55
# Write the plugin's usage summary to STDERR: the three extract_* options,
# followed by a note pointing at the PRESCRIPT package as a more capable
# PostScript-to-text converter.
#
# Returns the result of the final print.
sub print_usage {
    # Each entry is emitted with its own print, in this order.
    my @usage_chunks = (
        "\n usage: plugin PSPlug [options]\n\n",
        " options:\n",
        " -extract_date Extract date from PS header\n",
        " -extract_pages Extract pages from PS header\n",
        " -extract_title Extract title from PS header\n",
        "\n\nNote! This is a \"poor man's\" ps to text converter. If you are serious, consider\n",
        "using the PRESCRIPT package, which is available for download at\n http://www.nzdl.org/html/software.html\n\n",
    );

    my $printed;
    foreach my $chunk (@usage_chunks) {
        $printed = print STDERR $chunk;
    }
    return $printed;
}
65
66
# Default "block" expression for this plugin: a case-insensitive pattern
# matching file names that end in ".eps".
#
# Takes the plugin object (unused) and returns the pattern as a string.
sub get_default_block_exp {
    my ($self) = @_;

    my $eps_pattern = '(?i)\.(eps)$';
    return $eps_pattern;
}
72
# Default "process" expression for this plugin: a case-insensitive pattern
# matching file names that end in ".ps".
#
# Takes the plugin object (unused) and returns the pattern as a string.
sub get_default_process_exp {
    my ($self) = @_;

    my $ps_pattern = '(?i)\.ps$';
    return $ps_pattern;
}
78
# Scan a PostScript file line by line for header-style metadata and attach
# whatever is found to the top section of the document object.
#
# Arguments:
#   $self     - plugin object.  Its 'extract_title', 'extract_date' and
#               'extract_pages' flags select what is looked for; a
#               'verbosity' above 1 prints a progress message to STDERR.
#   $filename - path of the PostScript file to read.
#   $doc      - document object; get_top_section() and
#               add_utf8_metadata($section, $name, $value) are called on it.
#
# Returns nothing useful.  If the file cannot be opened, a message is
# printed to STDERR and no metadata is added; the failure is not fatal.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my $filename = shift (@_);
    my $doc = shift (@_);
    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Nothing was requested, so there is no reason to touch the file.
    return unless ($self->{'extract_title'}
                   || $self->{'extract_date'}
                   || $self->{'extract_pages'});

    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode, and a failure is reported instead of ignored.
    my $input;
    if (!open($input, "<", $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    my $date;

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # enclosing PostScript string parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word );
                    $title_found = 1;
                }
            }
        }
        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Form: "<Mon> <dd> <time> <yyyy>"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2,$1,$3);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Form: "D:yyyymmddhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # Form: "CreationDate: <yyyy> <Mon> <dd> <time>"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3,$2,$1);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date );
                    }
                }
                # The first "Creation..." match ends the date search, even
                # if none of the formats above could be parsed.
                $date_found = 1;
            }
        }
        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Ignore an empty or zero page count and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits );
                    $pages_found = 1;
                }
            }
        }

        # Every requested item has been found; the remaining lines could
        # not contribute anything, so stop reading.
        last if ((!$self->{'extract_title'} || $title_found)
                 && (!$self->{'extract_date'} || $date_found)
                 && (!$self->{'extract_pages'} || $pages_found));
    }

    close($input);
    return;
}
146
147
# Plugin-specific processing for PostScript documents: first try to pull
# metadata out of the raw file, then hand everything to ConvertToPlug to be
# processed as type "ps".
#
# Of the arguments following $self, only the third, fourth and sixth
# ($path, $file and $doc) are used here; the complete list is passed on
# unchanged to ConvertToPlug::process_type, whose result is returned.
sub process {
    my $self = shift (@_);
    my ($path, $file, $doc) = @_[2, 3, 5];

    if ($self->{'verbosity'} > 1) {
        print STDERR "PSPlug: passing $file onto $self->{'convert_to'} Plug\n";
    }

    extract_metadata_from_postscript($self, "$path/$file", $doc);

    return ConvertToPlug::process_type($self, "ps", @_);
}
160
161
1621;
163
Note: See TracBrowser for help on using the repository browser.