Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PostScriptPlugin.pm@ 22860

Last change on this file since 22860 was 22860, checked in by kjdon, 14 years ago
changed a line
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Rev	Line
[1685]	1	###########################################################################
	2	#
[17725]	3	# PostScriptPlugin.pm -- plugin to process PostScript files
[1685]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[17725]	26	package PostScriptPlugin;
[3540]	27
[15872]	28	use ConvertBinaryFile;
	29	use ReadTextFile; # for read_file in convert_post_process. do we need it?
[1700]	30	use sorttools;
[1685]	31
[10254]	32	use strict;
	33	no strict 'refs'; # allow filehandles to be variables and viceversa
	34
# Set up inheritance at compile time: this plugin is a ConvertBinaryFile
# that also pulls in the ReadTextFile methods.
BEGIN {
    @PostScriptPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
	38
# Permitted values for the "convert_to" option. Each entry pairs the option
# value with the key of its description string.
my $convert_to_list = [
    map { +{ 'name' => $_, 'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto text pagedimg_jpg pagedimg_gif pagedimg_png)
];
	51
# Option descriptors for this plugin. Each hash describes one command-line
# option: its name, the key of its description string, its type, and
# (where given) its default value, whether it is required, and its
# permitted values.
my $arguments = [
    {
        'name' => "convert_to",
        'desc' => "{ConvertBinaryFile.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "text",
    },
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    # Metadata extraction switches, read by extract_metadata_from_postscript.
    {
        'name' => "extract_date",
        'desc' => "{PostScriptPlugin.extract_date}",
        'type' => "flag",
    },
    {
        'name' => "extract_pages",
        'desc' => "{PostScriptPlugin.extract_pages}",
        'type' => "flag",
    },
    {
        'name' => "extract_title",
        'desc' => "{PostScriptPlugin.extract_title}",
        'type' => "flag",
    },
];
[3540]	77
# Plugin descriptor: name, description key, and the option descriptors
# declared in $arguments.
my $options = {
    'name'           => "PostScriptPlugin",
    'desc'           => "{PostScriptPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in postscript format can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
	84
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default "-title_sub"
#                      option and its pattern are appended to it
#   $hashArgOptLists - hash ref; this plugin's $arguments are appended to its
#                      "ArgList" array and $options to its "OptList" array
#
# Returns the object built by ConvertBinaryFile, blessed into $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs, "-title_sub");
    push(@$inputargs, '^(Page\s+\d+)?(\s*1\s+)?');

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call. The indirect-object form
    # ("new ConvertBinaryFile(...)") is parsed heuristically and is fragile
    # in a package that defines its own new(), as this one does.
    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "ps";
    $self->{'file_type'} = "PS";

    # "auto" means plain text for PostScript input.
    if ($self->{'convert_to'} eq "auto") {
        $self->{'convert_to'} = "text";
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();
    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    # Make sure there is an option list for the chosen secondary plugin,
    # then append to it.
    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # following title_sub removes "Page 1" added by ps2ascii, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    push(@$specific_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "TextPlugin") {
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
    } elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-processing_tmp_files");
    }

    # Re-bless before load_secondary_plugins is called on the object.
    $self = bless $self, $class;
    # used for convert_post_process
    $self->{'input_encoding'} = "auto";
    $self->{'default_encoding'} = "utf8";

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);

    return $self;
}
	142
	143
	144	sub get_default_block_exp {
	145	my $self = shift (@_);
	146
	147	return q^(?i)\.(eps)$^;
	148	}
	149
	150	sub get_default_process_exp {
	151	my $self = shift (@_);
	152
	153	return q^(?i)\.ps$^;
	154	}
	155
# NOTE: the equivalent routine has been commented out in other plugins; do we
# need it here? PS files are converted to images (item file should be in
# utf8) or text (uses pstoascii), so we shouldn't need to ensure utf8.
#
# Rewrites the converted file in place as utf8.
#   $conv_filename - path of the converted file to read and overwrite
sub convert_post_process {
    my ($self, $conv_filename) = @_;

    my ($lang, $enc) = $self->textcat_get_language_encoding($conv_filename);

    # Read the file in ($contents will be in utf8).
    my $contents = "";
    $self->read_file($conv_filename, $enc, $lang, \$contents);

    # Turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$contents);

    # Write the result back over the same file.
    $self->utf8_write_file(\$contents, $conv_filename);
}
	177
# Scan a PostScript file line by line and add Title, Date and Pages metadata
# to the top section of $doc. Each kind is only looked for when the matching
# option (extract_title, extract_date, extract_pages) is set on the plugin.
#
#   $filename - path of the PostScript file to read
#   $doc      - document object; get_top_section() and add_utf8_metadata()
#               are called on it
#
# If the file cannot be opened, a message is printed to STDERR and nothing
# is added. Returns nothing useful.
sub extract_metadata_from_postscript {
    my $self = shift(@_);
    my ($filename, $doc) = @_;

    my $section = $doc->get_top_section();

    my $title_found = 0;
    my $pages_found = 0;
    my $date_found  = 0;

    print STDERR "PostScriptPlugin: extracting PostScript metadata from \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # Three-arg open on a lexical handle, so the file name cannot be read
    # as an open mode and the handle does not leak into the package.
    my $input;
    if (!open($input, '<', $filename)) {
        print STDERR "PostScriptPlugin: unable to open \"$filename\" for metadata extraction: $!\n";
        return;
    }

    while (my $line = <$input>) {
        if ($self->{'extract_title'} && !$title_found) {
            foreach my $word ($line =~ m{Title: ([-A-Za-z0-9@/\(\):,. ]*)}g) {
                # Strip placeholder titles and the surrounding parentheses.
                my $new_word = $word;
                $new_word =~ s/\(Untitled\)//i;
                $new_word =~ s/\(Microsoft Word\)//i;
                $new_word =~ s/Microsoft Word//i;
                $new_word =~ s/^\(//i;
                $new_word =~ s/\)$//i;
                $new_word =~ s/^ - //i;
                if ($new_word ne "") {
                    $doc->add_utf8_metadata($section, "Title", $new_word);
                    $title_found = 1;
                }
            }
        }

        if ($self->{'extract_date'} && !$date_found) {
            foreach my $word ($line =~ m/(Creation[-A-Za-z0-9@\/\(\):,. ]*)/g) {
                my $date;

                # e.g. "... Jan 12 10:00:00 1999"
                if ($word =~ m/ ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) ?[0-9: ]+ ([0-9]{4})/) {
                    $date = &sorttools::format_date($2, $1, $3);
                }
                # e.g. "D:19990112100000)"
                if (!defined $date
                    && $word =~ m/D:([0-9]{4})([0-9]{2})([0-9]{2})[0-9]{6}\)/) {
                    $date = &sorttools::format_date($3, $2, $1);
                }
                # e.g. "CreationDate: 1999 Jan 12 10:00"
                if (!defined $date
                    && $word =~ m/CreationDate: ([0-9]{4}) ([A-Za-z][A-Za-z][A-Za-z]) ([0-9 ][0-9]) [0-9:]*/) {
                    $date = &sorttools::format_date($3, $2, $1);
                }

                # Only stop looking once a date has really been added, so a
                # line with no parsable date (such as "(atend)") does not
                # hide a usable one further down the file.
                if (defined $date) {
                    $doc->add_utf8_metadata($section, "Date", $date);
                    $date_found = 1;
                    last;
                }
            }
        }

        if ($self->{'extract_pages'} && !$pages_found) {
            foreach my $word ($line =~ m/(Pages: [0-9]*)/g) {
                my $digits = $word;
                $digits =~ s/[^0-9]//g;
                # Skip an empty or zero page count.
                if ($digits ne "" && $digits ne "0") {
                    $doc->add_utf8_metadata($section, "Pages", $digits);
                    $pages_found = 1;
                }
            }
        }
    }

    close($input);
    return;
}
	246
# Plugin-specific processing of the document object: extract metadata from
# the PostScript source file first, then hand over to the parent class's
# process() with the same arguments.
sub process {
    my $self = shift(@_);
    # Only the base directory, file name and document object are used here;
    # the full argument list is passed on untouched below.
    my (undef, $base_dir, $file, undef, $doc_obj) = @_;

    my $ps_path = &util::filename_cat($base_dir, $file);
    $self->extract_metadata_from_postscript($ps_path, $doc_obj);

    return $self->SUPER::process(@_);
}
	258
	259
	260	1;
	261

Note: See TracBrowser for help on using the repository browser.

Download in other formats: