Context Navigation

source: trunk/gsdl/perllib/plugins/ConvertToPlug.pm@ 2029

Last change on this file since 2029 was 2029, checked in by jrm21, 23 years ago
Return 0 instead of "" on error in read() so that RecPlug can continue.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.1 KB

Rev	Line
[1410]	1	###########################################################################
	2	#
	3	# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
	4	# on plugin argument convert_to
	5	#
	6	# A component of the Greenstone digital library software
	7	# from the New Zealand Digital Library Project at the
	8	# University of Waikato, New Zealand.
	9	#
	10	# Copyright (C) 1999 New Zealand Digital Library Project
	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28	# The plugin is inherited by such plugins as WordPlug and PDFPlug.
	29	# It facilitates the conversion of these document types to either HTML
	30	# or TEXT by setting up variables that instruct ConvertToBasPlug
	31	# how to work.
	32
	33	# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
	34	# the plugin argument 'convert_to'. If the argument is not present,
	35	# the default is to inherit HTMLPlug.
	36
	37
	38	package ConvertToPlug;
	39
[1446]	40	use BasPlug;
[1410]	41	use HTMLPlug;
	42	use TEXTPlug;
	43
	# The parent class is fixed while the package is being compiled:
	# ConvertToPlug inherits from HTMLPlug.
	#
	# Parent lists that were tried earlier, kept for reference:
	#   @ISA = ('HTMLPlug', 'TEXTPlug');
	#   @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
	BEGIN {
	    @ISA = qw(HTMLPlug);
	}
	49
	50	use strict;
	51
	# Write the usage summary for this plugin to STDERR.
	#
	# $plugin_name is normally a plain name.  When pluginfo.pl calls this
	# function directly it passes an object instead; in that case the name of
	# the object's class is shown.
	sub print_usage {
	    my ($plugin_name) = @_;

	    # an object was passed: report its class name
	    my $class_of_arg = ref ($plugin_name);
	    $plugin_name = $class_of_arg if $class_of_arg;

	    my @usage_lines = (
	        "\n usage: plugin $plugin_name [options]\n\n",
	        " options:\n",
	        " -convert_to (html|text) plugin converts to TEXT or HTML\n",
	        " (default html)\n",
	    );

	    foreach my $usage_line (@usage_lines) {
	        print STDERR $usage_line;
	    }
	}
	65
	# Parse the plugin arguments that ConvertToPlug itself understands.
	#
	# Called as a class method with a reference to the argument list.  The
	# list is handed to parsargv::parse together with "allow_extra_options",
	# so that options meant for other classes are not rejected here.
	#
	# Returns the list ($plugin_name, $generate_format, $kea_arg):
	#   $plugin_name     - $class with a trailing ".pm" removed
	#   $generate_format - value filled in for -convert_to (the spec names
	#                      "html" as its default)
	#   $kea_arg         - hash ref with the keys 'kea' and 'kea_options',
	#                      filled in from -extract_keyphrases and
	#                      -extract_keyphrase_options
	#
	# If parsargv::parse reports failure, a message and the usage text are
	# printed to STDERR and the function dies.
	sub parse_args
	{
	    my $class = shift (@_);
	    my ($args) = @_;

	    (my $plugin_name = $class) =~ s/\.pm$//;

	    my $generate_format;
	    my $kea_arg = {};

	    # The convert_to pattern must use a plain "|" so that it is an
	    # alternation between "html" and "text".
	    my $parsed_ok = parsargv::parse($args,
	        q^extract_keyphrases^, \$kea_arg->{'kea'},
	        q^extract_keyphrase_options/.*/^, \$kea_arg->{'kea_options'},
	        q^convert_to/(html|text)/html^, \$generate_format,
	        "allow_extra_options");

	    if (!$parsed_ok) {
	        print STDERR "\nIncorrect options passed to $plugin_name, ";
	        print STDERR "check your collect.cfg configuration file\n";
	        print_usage($plugin_name);
	        die "\n";
	    }

	    return ($plugin_name, $generate_format, $kea_arg);
	}
	91
	# Constructor.
	#
	# The arguments are first run through parse_args.  Depending on the
	# -convert_to value the object is built by TEXTPlug ("text") or by
	# HTMLPlug (anything else) and then blessed into $class.
	#
	# Fields set here:
	#   convert_to         - "TEXT" or "HTML"
	#   convert_to_ext     - "txt" or "html"
	#   rename_assoc_files - 1 (HTML only)
	#   metadata_fields    - ",GENERATOR" is appended (HTML only)
	#   kea, kea_options   - set to 1 when the matching argument was true
	sub new {
	    my $class = shift (@_);
	    my (undef, $generate_format, $kea_arg) = $class->parse_args(\@_);
	    my $self;

	    # Arrow method calls are used instead of "new TEXTPlug (...)": the
	    # indirect-object form depends on parser heuristics, and this
	    # package defines its own "new".
	    if ($generate_format eq "text") {
	        $self = TEXTPlug->new($class, @_);
	        $self->{'convert_to'} = "TEXT";
	        $self->{'convert_to_ext'} = "txt";
	    }
	    else {
	        $self = HTMLPlug->new($class, @_);
	        $self->{'convert_to'} = "HTML";
	        $self->{'convert_to_ext'} = "html";

	        $self->{'rename_assoc_files'} = 1;
	        $self->{'metadata_fields'} .= ",GENERATOR";
	    }

	    # if kea data to be extracted...
	    $self->{'kea'} = 1 if ($kea_arg->{'kea'});
	    # NOTE(review): only a flag is stored; the option string that was
	    # parsed into $kea_arg->{'kea_options'} is not kept -- confirm that
	    # this is what the code reading 'kea_options' expects.
	    $self->{'kea_options'} = 1 if ($kea_arg->{'kea_options'});

	    return bless $self, $class;
	}
	119
	120
[1435]	121
# Run the conversion utility on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input: the input file is soft-linked
# into that directory and gsConvert.pl is run on the link.
#
# Arguments:
#   $output_ext     - desired output type, usually something like "html" or
#                     "word", but can be "best" (or the empty string) to
#                     indicate that the conversion utility should do the
#                     best it can.  Not used by this implementation: the
#                     type requested from gsConvert.pl is the lower-cased
#                     $self->{'convert_to'}.
#   $input_filename - file to convert
#   $textref        - not used here
#
# Returns the name of the converted file in the tmp directory, or "" if the
# conversion failed.  On success $self->{'convert_to_ext'} is set to the
# output type reported by gsConvert.pl.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $convert_to = $self->{'convert_to'};

    # softlink to collection tmp dir
    my $colname = &util::use_collection();
    my $tmp_dirname
        = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $colname, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, undef, $suffix)
        = File::Basename::fileparse($input_filename, '\.[^\.]+$');

    # Remove any white space from the file name -- no risk of name
    # collision, and makes later conversion by utils simpler.  Only the name
    # is touched: stripping the whole path would corrupt a tmp directory
    # whose own path contains white space.
    (my $tmp_tailname = $tailname) =~ s/\s+//g;
    (my $tmp_suffix = $suffix) =~ s/\s+//g;
    my $tmp_filename
        = &util::filename_cat($tmp_dirname, "$tmp_tailname$tmp_suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print STDERR "Converting $tailname$suffix to $convert_to format\n";
    }

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type.
    # NOTE(review): the file name is passed through the shell inside double
    # quotes; names containing ", $, ` or \ are not escaped.
    my $output_type = lc($convert_to);
    my $cmd = "gsConvert.pl -verbose $verbosity -output $output_type \"$tmp_filename\"";
    $output_type = `$cmd`;
    $output_type = "" unless defined $output_type;
    chomp $output_type;

    # remove symbolic link to original file, whether or not the conversion
    # worked, so that a failure does not leave the link behind
    &util::rm($tmp_filename);

    # No output at all (for example when the converter could not be run) is
    # treated like an explicit "fail".
    if ($output_type eq "" || $output_type eq "fail") {
        print STDERR "Could not convert $tailname$suffix to $convert_to format\n";
        return "";
    }

    # store the actual output type and return the output filename
    $self->{'convert_to_ext'} = $output_type;
    my $output_filename = $tmp_filename;
    # \Q..\E: the suffix is literal text, not a pattern
    $output_filename =~ s/\Q$tmp_suffix\E$/.$output_type/;

    return $output_filename;
}
	185
	186
	# Empty the collection specific tmp directory: the directory is removed
	# together with everything in it and then created again.
	#
	# The directory is GSDLHOME/collect/<collection>/tmp, where <collection>
	# is whatever util::use_collection returns.
	sub cleanup_tmp_area {
	    my ($self) = @_;

	    my $collection = &util::use_collection();
	    my @path_parts = ($ENV{'GSDLHOME'}, "collect", $collection, "tmp");
	    my $tmp_area = &util::filename_cat(@path_parts);

	    &util::rm_r($tmp_area);
	    &util::mk_dir($tmp_area);
	}
	198
	199
	200
[1420]	201
	# Override BasPlug read.
	#
	# We don't want to get language encoding stuff until after we've
	# converted our file to either TEXT or HTML, so the file is converted
	# first (see tmp_area_convert_file) and language and encoding are worked
	# out from the converted file.
	#
	# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor,
	# $maxdocs); $maxdocs is not used here.
	#
	# Returns:
	#   undef - $file does not match process_exp or is not a plain file, or
	#           process() returned undef
	#   0     - $file matches block_exp, the conversion failed, or the
	#           converted file contained no text
	#   1     - a document was built and handed to $processor
	sub read {
	    my $self = shift (@_);
	    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

	    my $outhandle = $self->{'outhandle'};

	    my $filename = &util::filename_cat($base_dir, $file);
	    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
	    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
	        return undef;
	    }
	    my $plugin_name = ref ($self);
	    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

	    # read in file ($text will be in utf8)
	    my $text = "";

	    my $output_ext = $self->{'convert_to_ext'};
	    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);
	    if ("$conv_filename" eq "") {return 0;} # allows continue on errors
	    $self->{'conv_filename'} = $conv_filename;

	    # Do encoding stuff
	    my ($language, $encoding);
	    if ($self->{'input_encoding'} eq "auto") {
	        # use textcat to automatically work out the input encoding and language
	        ($language, $encoding) = $self->get_language_encoding ($conv_filename);
	    } elsif ($self->{'extract_language'}) {
	        # use textcat to get language metadata
	        # Assign to the outer $language; declaring a new one here would
	        # hide it and leave the Language metadata below undefined.
	        my $extracted_encoding;
	        ($language, $extracted_encoding) = $self->get_language_encoding ($conv_filename);
	        $encoding = $self->{'input_encoding'};
	        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
	            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
	            print $outhandle "appears to be encoded as $extracted_encoding.\n";
	        }
	    } else {
	        $language = $self->{'default_language'};
	        $encoding = $self->{'input_encoding'};
	    }

	    BasPlug::read_file($self, $conv_filename, $encoding, \$text);
	    if (!length ($text)) {
	        print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
	        return 0;
	    }

	    # create a new document
	    my $doc_obj = doc->new($conv_filename, "indexed_doc");

	    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language",
	                                $language);
	    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding",
	                                $encoding);

	    # include any metadata passed in from previous plugins
	    # note that this metadata is associated with the top level section
	    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	    # do plugin specific processing of doc_obj
	    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
	    # do any automatic metadata extraction
	    $self->auto_extract_metadata ($doc_obj);
	    # add an OID
	    $doc_obj->set_OID();
	    # process the document
	    $processor->process($doc_obj);
	    $self->cleanup_tmp_area();

	    return 1;
	}
	280
	281
# Do the plugin specific processing of doc_obj for a converted document.
#
# The converted file recorded in $self->{'conv_filename'} is processed by
# TEXTPlug::process when $self->{'convert_to'} is "TEXT" and by
# HTMLPlug::process otherwise; both are given the directory and the file
# name of the converted file in place of $base_dir and $file.
#
# Afterwards the original source file ($base_dir/$file) is associated with
# the top section of the document as "doc.$doc_ext", and the metadata
# "srclink", "srcicon" and "/srclink" are added to that section.
#
# Returns whatever the chosen process function returned.
sub process_type {
    my ($self, $doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $converted = $self->{'conv_filename'};
    my $converted_dir = File::Basename::dirname($converted);
    my $converted_name = File::Basename::basename($converted);

    # choose the parent implementation that matches the conversion target
    my $process = ($self->{'convert_to'} eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;
    my $ret_val = $process->($self, $textref, $pluginfo,
                             $converted_dir, $converted_name,
                             $metadata, $doc_obj);

    # associate original file with doc object
    my $top_section = $doc_obj->get_top_section();
    my $source_file = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($source_file, "doc.$doc_ext", undef, $top_section);

    # metadata, in the order it is added: link opener, icon macro, link closer
    my @source_metadata = (
        ["srclink",  "<a href=_httpcollection_/index/assoc/[archivedir]/doc.$doc_ext>"],
        ["srcicon",  "_icon" . $doc_ext . "_"],
        ["/srclink", "</a>"],
    );
    foreach my $entry (@source_metadata) {
        $doc_obj->add_utf8_metadata ($top_section, $entry->[0], $entry->[1]);
    }

    return $ret_val;
}
	319
	320	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: