###########################################################################
#
# PSPlug.pm -- this might look VERY similar to the PDF plugin...
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# 12/05/02 Added usage datastructure - John Thompson
package PSPlug;
use ConvertToPlug;
use sorttools;
use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa
# Establish inheritance at compile time: PSPlug derives from ConvertToPlug.
BEGIN {
    @PSPlug::ISA = ('ConvertToPlug');
}
# Values offered for the "convert_to" enum argument. Each entry pairs
# the option value ('name') with the key of its description string ('desc').
my $convert_to_list = [
    {
        name => 'auto',
        desc => '{ConvertToPlug.convert_to.auto}',
    },
    {
        name => 'text',
        desc => '{ConvertToPlug.convert_to.text}',
    },
    {
        name => 'pagedimg_jpg',
        desc => '{ConvertToPlug.convert_to.pagedimg_jpg}',
    },
    {
        name => 'pagedimg_gif',
        desc => '{ConvertToPlug.convert_to.pagedimg_gif}',
    },
    {
        name => 'pagedimg_png',
        desc => '{ConvertToPlug.convert_to.pagedimg_png}',
    },
];
# Argument descriptions for this plugin. The list is appended to the
# caller-supplied "ArgList" in new(). The process_exp and block_exp
# defaults are taken from this package's get_default_* subs.
my $arguments = [
    {
        name => 'convert_to',
        desc => '{ConvertToPlug.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {
        name => 'extract_date',
        desc => '{PSPlug.extract_date}',
        type => 'flag',
    },
    {
        name => 'extract_pages',
        desc => '{PSPlug.extract_pages}',
        type => 'flag',
    },
    {
        name => 'extract_title',
        desc => '{PSPlug.extract_title}',
        type => 'flag',
    },
];
# Plugin description record; appended to the caller-supplied "OptList"
# in new(). 'args' points at the argument list defined for this plugin.
my $options = {
    name     => 'PSPlug',
    desc     => '{PSPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
# new -- construct a PSPlug object.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default
#                      "-title_sub" pair is appended before the list is
#                      handed to the ConvertToPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended
#                      to its "ArgList" entry and its option record to
#                      its "OptList" entry
#
# Returns the object produced by ConvertToPlug, re-blessed into $class.
# When the object's 'info_only' field is set, it is returned straight
# away without configuring or loading any secondary plugins.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # The following title_sub removes "Page 1" added by ps2ascii, and a
    # leading "1", which is often the page number at the top of the page.
    # Bad luck if your document title actually starts with "1 " - is
    # there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    #push(@$inputargs,"-convert_to");
    #push(@$inputargs,"text");
    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Direct method-call syntax: unlike "new ConvertToPlug(...)", it
    # cannot be mis-parsed as a call to a local sub named "new".
    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : ConvertToPlug->new($pluginlist, $inputargs);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    # PagedImgPlug options are only seeded when converting to a paged
    # image format and nothing has been supplied for that plugin yet.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImgPlug'}) {
            $secondary_plugin_options->{'PagedImgPlug'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImgPlug'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
        }
    }

    #$self->{'input_encoding'} = "utf8";
    #$self->{'extract_language'} = 1;
    push(@$text_options, "-input_encoding", "utf8");
    push(@$text_options, "-extract_language");
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
# get_default_block_exp -- default value for the block_exp argument:
# a case-insensitive pattern matching names that end in ".eps".
# Any arguments (including an invocant) are ignored.
sub get_default_block_exp {
    return '(?i)\.(eps)$';
}
# get_default_process_exp -- default value for the process_exp argument:
# a case-insensitive pattern matching names that end in ".ps".
# Any arguments (including an invocant) are ignored.
sub get_default_process_exp {
    return '(?i)\.ps$';
}
# convert_post_process -- rewrite a converted file in place as utf8.
#
# Arguments:
#   $conv_filename - path of the converted file
#
# The language and encoding are obtained from
# textcat_get_language_encoding, the file is read with read_file,
# passed through unicode::ensure_utf8 and written back to the same
# path. Returns whatever utf8_write_file returns.
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    my ($language, $encoding)
        = $self->textcat_get_language_encoding($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($conv_filename, $encoding, $language, \$text);

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    return $self->utf8_write_file(\$text, $conv_filename);
}
# extract_metadata_from_postscript -- scan a PostScript file line by
# line and record selected header values as metadata.
#
# Arguments:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; metadata is added to the section
#               returned by its get_top_section method
#
# The extract_title, extract_date and extract_pages fields of $self
# select which of "Title", "Date" and "Pages" are looked for. Scanning
# stops as soon as every requested item has been dealt with. If the
# file cannot be opened, a message is printed to STDERR and nothing is
# added. Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-argument open with a lexical handle: the filename can no
    # longer be interpreted as an open mode, and failure is reported.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: unable to open \"$filename\" to extract metadata: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m|Title: ([-A-Za-z0-9@/\/\(\):,. ]*)|g) {
                # Strip placeholder titles, application names and the
                # parentheses that delimit the PostScript string.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                # Each pattern is tried independently. Captures are
                # copied before the call so that a regex match inside
                # format_date cannot change them through @_ aliasing.

                # month name, day, time, four-digit year
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    my ($month, $day, $year) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # D:YYYYMMDDhhmmss)
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # CreationDate: year, month name, day, time
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    my ($year, $month, $day) = ($1, $2, $3);
                    my $date = sorttools::format_date($day, $month, $year);
                    if (defined $date) {
                        $doc->add_utf8_metadata($section, "Date", $date);
                    }
                }
                # Marked as found once any "Creation..." text has been
                # seen, whether or not a date could be formatted.
                $date_found = 1;
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Nothing on later lines can add metadata once every requested
        # item is found, so stop reading.
        last if (!$self->{'extract_title'} || $title_found)
             && (!$self->{'extract_date'}  || $date_found)
             && (!$self->{'extract_pages'} || $pages_found);
    }

    close($input);
    return;
}
# do plugin specific processing of doc_obj for PostScript type
# process -- plugin-specific handling of one PostScript document.
#
# Arguments: ($textref, $pluginfo, $base_dir, $file, $metadata,
# $doc_obj, $gli). Only $base_dir, $file and $doc_obj are used here.
#
# The file's full path is built with util::filename_cat, metadata is
# extracted from it into $doc_obj, and the document is then passed to
# process_type as type "ps". Returns the result of process_type.
sub process {
    my $self = shift;
    my (undef, undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->process_type("ps", $base_dir, $file, $doc_obj);
}
1;