Context Navigation

source: trunk/gsdl/perllib/plugins/PSPlug.pm@ 13543

Last change on this file since 13543 was 12834, checked in by kjdon, 18 years ago
These ConvertTo plugins were all passing extract_language=1 to their secondary plugins. We don't want this: the option should only be passed to a secondary plugin if the user has asked for it. textcat can be very slow, so we don't want to run it unless we have to.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.9 KB

Rev	Line
[1685]	1	###########################################################################
	2	#
	3	# PSPlug.pm -- this might look VERY similar to the PDF plugin...
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[3540]	26	# 12/05/02 Added usage datastructure - John Thompson
	27
[1685]	28	package PSPlug;
	29
	30	use ConvertToPlug;
[1700]	31	use sorttools;
[1685]	32
[10254]	33	use strict;
	34	no strict 'refs'; # allow filehandles to be variables and viceversa
	35
# Set up inheritance at compile time: PSPlug is a ConvertToPlug.
BEGIN {
    @PSPlug::ISA = qw(ConvertToPlug);
}
	39
# Allowed values for the -convert_to option.  Each entry pairs the option
# value with the resource key of its description, which always has the form
# "{ConvertToPlug.convert_to.<value>}".
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertToPlug.convert_to.$_}" } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
	52
# Argument descriptions for PSPlug.  The 'desc' values are resource keys
# (in curly braces) rather than literal text.  The default process/block
# expressions come from the subs defined further down in this file.
my $arguments = [
    {   name => 'convert_to',
        desc => '{ConvertToPlug.convert_to}',
        type => 'enum',
        reqd => 'yes',
        list => $convert_to_list,
        deft => 'text',
    },
    {   name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'regexp',
        deft => get_default_block_exp(),
    },
    {   name => 'extract_date',
        desc => '{PSPlug.extract_date}',
        type => 'flag',
    },
    {   name => 'extract_pages',
        desc => '{PSPlug.extract_pages}',
        type => 'flag',
    },
    {   name => 'extract_title',
        desc => '{PSPlug.extract_title}',
        type => 'flag',
    },
];
[3540]	78
	79	my $options = { 'name' => "PSPlug",
[5680]	80	'desc' => "{PSPlug.desc}",
[6408]	81	'abstract' => "no",
[3540]	82	'inherits' => "yes",
	83	'args' => $arguments };
	84
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a "-title_sub" option
#                      is appended before it is handed to ConvertToPlug
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument and option descriptions
#
# Returns the new object blessed into $class.  When the parent constructor
# reports 'info_only', the object is returned straight away without any
# secondary plugin set-up.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # This title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit class-method call.  The indirect-object form
    # ("new ConvertToPlug(...)") depends on parser heuristics, which is
    # fragile in a package that defines its own sub called "new".
    my $self = ConvertToPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    # Only supply default PagedImgPlug options when converting to paged
    # images and none have been set up already.
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        if (!defined $secondary_plugin_options->{'PagedImgPlug'}) {
            $secondary_plugin_options->{'PagedImgPlug'} = [ "-title_sub", $title_sub ];
        }
    }

    push(@$text_options, "-input_encoding", "utf8");
    # Only ask the secondary plugin to extract the language if the user
    # asked for it: textcat can be very slow.
    push(@$text_options, "-extract_language") if $self->{'extract_language'};
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
	135
	136
	137	sub get_default_block_exp {
	138	my $self = shift (@_);
	139
	140	return q^(?i)\.(eps)$^;
	141	}
	142
	143	sub get_default_process_exp {
	144	my $self = shift (@_);
	145
	146	return q^(?i)\.ps$^;
	147	}
	148
# Post-process a converted file in place: read it using the language and
# encoding reported by textcat_get_language_encoding(), make sure the text
# is valid utf-8, and write it back to the same file.
#
# Parameters:
#   $conv_filename - path of the converted file to rewrite
sub convert_post_process
{
    my ($self, $conv_filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding($conv_filename);

    # Read the file; the text will be in utf8 afterwards.
    my $utf8_text = "";
    $self->read_file($conv_filename, $encoding, $language, \$utf8_text);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$utf8_text);

    # Write the cleaned text back over the converted file.
    return $self->utf8_write_file(\$utf8_text, $conv_filename);
}
	168
# Scan a PostScript file for "Title:", "Creation..." and "Pages:" lines and
# add the values found as Title / Date / Pages metadata on the top section
# of $doc.
#
# Parameters:
#   $filename - path of the PostScript file to scan
#   $doc      - document object; get_top_section() and
#               add_utf8_metadata($section, $name, $value) are called on it
#
# Which fields are looked for is controlled by $self->{'extract_title'},
# $self->{'extract_date'} and $self->{'extract_pages'}.  Scanning stops as
# soon as every requested field has been dealt with.
#
# This is best effort: if the file cannot be opened a message is printed to
# STDERR and no metadata is added.  Returns nothing.
sub extract_metadata_from_postscript {
    my $self = shift (@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    print STDERR "PSPlug: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # A field that was not requested is treated as already found, so the
    # loop below only has to look at the *_found flags.
    my $title_found = $self->{'extract_title'} ? 0 : 1;
    my $date_found  = $self->{'extract_date'}  ? 0 : 1;
    my $pages_found = $self->{'extract_pages'} ? 0 : 1;

    # Nothing requested: no need to touch the file at all.
    return if $title_found && $date_found && $pages_found;

    # Three-argument open with a lexical handle: the file name can never be
    # mistaken for a mode, and the handle cannot clash with another INPUT.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PSPlug: could not open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    # Format (day, month, year) with sorttools and record it as Date
    # metadata, unless sorttools could not make a date out of it.
    my $add_date = sub {
        my ($day, $month, $year) = @_;
        my $date = sorttools::format_date($day, $month, $year);
        $doc->add_utf8_metadata($section, "Date", $date) if defined $date;
    };

    while (my $line = <$input>) {
        if (!$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9\@/\(\):,. ]*)}g) {
                # Strip placeholder titles, an application name, and the
                # parentheses around the title string.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if (!$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9\@\/\(\):,. ]*)/g) {
                # " Mon DD <time> YYYY"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $add_date->($2, $1, $3);
                }
                # "D:YYYYMMDDhhmmss)"
                if ($word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $add_date->($3, $2, $1);
                }
                # "CreationDate: YYYY Mon DD <time>"
                if ($word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $add_date->($3, $2, $1);
                }
                # The first "Creation..." text ends the search for a date,
                # whether or not a date could be parsed from it.
                $date_found = 1;
            }
        }

        if (!$pages_found) {
            foreach my $digits ($line =~ m/Pages: ([0-9]*)/g) {
                # Ignore an empty or zero page count and keep looking.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }

        # Everything requested has been handled; skip the rest of the file.
        last if $title_found && $date_found && $pages_found;
    }

    close($input);

    return;
}
	237
# Plugin-specific processing of a PostScript document: first pull any
# requested metadata out of the original file into $doc_obj, then hand the
# file on to process_type() as type "ps" and return whatever it returns.
#
# $textref, $pluginfo, $metadata and $gli are accepted but not used here.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $ps_path = util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->process_type("ps", $base_dir, $file, $doc_obj);
}
	254
	255
	256	1;
	257

Note: See TracBrowser for help on using the repository browser.

Download in other formats: