########################################################################### # # PSPlug.pm -- this might look VERY similar to the PDF plugin... # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

# 12/05/02 Added usage datastructure - John Thompson

# PSPlug converts PostScript (.ps) files to text by delegating to
# ConvertToPlug, and can optionally pull Title, Date and Pages metadata
# out of the PostScript source before the conversion happens.

package PSPlug;

use strict;

use ConvertToPlug;
use sorttools;

sub BEGIN {
    # Fully qualified so that it is legal under "use strict".
    @PSPlug::ISA = ('ConvertToPlug');
}

# Description of the command-line style options this plugin understands.
# The 'desc' values are keys, not literal text.
my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BasPlug.process_exp}",
	'type' => "regexp",
	'deft' => get_default_process_exp(),
	'reqd' => "no" },
      { 'name' => "block_exp",
	'desc' => "{BasPlug.block_exp}",
	'type' => 'regexp',
	'deft' => get_default_block_exp() },
      { 'name' => "extract_date",
	'desc' => "{PSPlug.extract_date}",
	'type' => "flag" },
      { 'name' => "extract_pages",
	'desc' => "{PSPlug.extract_pages}",
	'type' => "flag" },
      { 'name' => "extract_title",
	'desc' => "{PSPlug.extract_title}",
	'type' => "flag" } ];

# Usage record for this plugin; pushed onto the inherited option list in new().
my $options = { 'name'     => "PSPlug",
		'desc'     => "{PSPlug.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args'     => $arguments };

# Constructor.
#   $class - class name to bless the new object into
#   @_     - plugin options; extract_date, extract_pages and extract_title
#            are consumed here, anything else is tolerated
#            ("allow_extra_options").
# Returns the blessed plugin object.  Dies (after printing usage) when the
# options cannot be parsed.
sub new {
    my $class = shift (@_);

    # title_sub removes leading "Page 1" or "1 " from auto-extracted title.
    my $self = ConvertToPlug->new($class, "-convert_to", "text", @_,
				  "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PSPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
			 q^extract_date^, \$self->{'extract_date'},
			 q^extract_pages^, \$self->{'extract_pages'},
			 q^extract_title^, \$self->{'extract_title'},
			 "allow_extra_options")) {
	print STDERR "\nIncorrect options passed to PSPlug, check your collect.cfg configuration file\n";
	$self->print_txt_usage("");  # Use default resource bundle
	die "\n";
    }

    return bless $self, $class;
}

# Default regular expression for files this plugin blocks: *.eps,
# case-insensitively.
sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)\.(eps)$^;
}

# Default regular expression for files this plugin processes: *.ps,
# case-insensitively.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.ps$^;
}

# Scan the PostScript file line by line and add Title, Date and/or Pages
# metadata to the top section of $doc, depending on which of the
# extract_title / extract_date / extract_pages flags are set on $self.
#   $self     - plugin object (reads 'verbosity' and the extract_* flags)
#   $filename - path of the PostScript file to scan
#   $doc      - document object; must provide get_top_section() and
#               add_utf8_metadata()
# Metadata extraction is best effort: if the file cannot be opened a
# message is written to STDERR and the document is left untouched.
# Returns nothing useful.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my $filename = shift (@_);
    my $doc = shift (@_);

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
	if $self->{'verbosity'} > 1;

    my $input;
    if (!open($input, '<', $filename)) {
	print STDERR "PSPlug: unable to open \"$filename\" to extract metadata: $!\n";
	return;
    }

    while (my $line = <$input>) {
	# Nothing further can be added once every requested item has been
	# found, so avoid reading the rest of a potentially large file.
	last if ((!$self->{'extract_title'} || $title_found)
		 && (!$self->{'extract_date'} || $date_found)
		 && (!$self->{'extract_pages'} || $pages_found));

	if ($self->{'extract_title'} && !$title_found) {
	    foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
		# Strip placeholder titles, application names and the
		# surrounding PostScript string parentheses.
		my $new_word = $word;
		$new_word =~ s/\(Untitled\)//i;
		$new_word =~ s/\(Microsoft Word\)//i;
		$new_word =~ s/Microsoft Word//i;
		$new_word =~ s/^\(//i;
		$new_word =~ s/\)$//i;
		$new_word =~ s/^ - //i;
		if ($new_word ne "") {
		    $doc->add_utf8_metadata($section, "Title", $new_word );
		    $title_found = 1;
		}
	    }
	}

	if ($self->{'extract_date'} && !$date_found) {
	    foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
		my $date;

		# e.g. " May 12 10:20:30 2002" (month, day, time, year)
		if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
		    $date = sorttools::format_date($2,$1,$3);
		    if (defined $date) {
			$doc->add_utf8_metadata($section, "Date", $date );
		    }
		}
		# e.g. "D:20020512102030)" (yyyymmddhhmmss)
		if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
		    $date = sorttools::format_date($3,$2,$1);
		    if (defined $date) {
			$doc->add_utf8_metadata($section, "Date", $date );
		    }
		}
		# e.g. "CreationDate: 2002 May 12 10:20:30" (year, month, day, time)
		if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
		    $date = sorttools::format_date($3,$2,$1);
		    if (defined $date) {
			$doc->add_utf8_metadata($section, "Date", $date );
		    }
		}
		# The first creation-date comment ends the search even if
		# none of the known formats matched it.
		$date_found = 1;
	    }
	}

	if ($self->{'extract_pages'} && !$pages_found) {
	    foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
		my $digits = $word;
		$digits =~ s/[^0-9]//g;
		# Ignore empty and zero page counts.
		if ($digits ne "" && $digits ne "0") {
		    $doc->add_utf8_metadata($section, "Pages", $digits );
		    $pages_found = 1;
		}
	    }
	}
    }

    close($input);

    return;
}

# do plugin specific processing of doc_obj for PostScript type
#   $self - plugin object
#   @_    - six positional arguments; only the third (directory path),
#           fourth (file name) and sixth (document object) are used here,
#           the full list is handed on unchanged to
#           ConvertToPlug::process_type.
# Returns whatever ConvertToPlug::process_type returns.
sub process {
    my $self = shift (@_);
    my ($trash, $trash2, $path, $file, $trash3, $doc) = @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "PSPlug: passing $file on to $self->{'converted_to'}Plug\n"
	if $self->{'verbosity'} > 1;

    extract_metadata_from_postscript($self, "$path/$file", $doc);

    #$doc->add_utf8_metadata($doc->get_top_section(), "PSPlug", 1);

    return ConvertToPlug::process_type($self,"ps",@_);
}

1;