###########################################################################
#
# ConvertToPlug.pm -- plugin that inherits from HTML or TEXT Plug, depending
#                     on plugin argument convert_to
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# The plugin is inherited by such plugins as WordPlug and PDFPlug.
# It facilitates the conversion of these document types to either HTML
# or TEXT by setting up variables that instruct ConvertToBasPlug
# how to work.
# It works by dynamically inheriting HTMLPlug or TEXTPlug based on
# the plugin argument 'convert_to'. If the argument is not present,
# the default is to inherit HTMLPlug.
package ConvertToPlug;

use BasPlug;
use HTMLPlug;
use TEXTPlug;
use ghtml;

# fileparse() is called by its fully qualified name below; load the module
# explicitly (importing nothing) instead of relying on another module
# having pulled it in.
use File::Basename ();

# Static parent class.  new() builds the underlying object with either
# TEXTPlug->new or HTMLPlug->new and re-blesses it into the calling class,
# so method lookup for the resulting object goes through this @ISA.
sub BEGIN {
    @ISA = ('HTMLPlug');
#    @ISA = ('HTMLPlug', 'TEXTPlug');
#    @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug');
}

# Permitted values for the 'convert_to' argument.
my $convert_to_list =
    [ { 'name' => "html",
        'desc' => "HTML format" },
      { 'name' => "text",
        'desc' => "Plain text format" } ];

# Description of the arguments this plugin adds.
my $arguments =
    [ { 'name' => "convert_to",
        'desc' => "Plugin converts to TEXT or HTML.",
        'type' => "enum",
        'reqd' => "no",
        'list' => $convert_to_list,
        'deft' => "html" },
      { 'name' => "use_strings",
        'desc' => "If set, a simple strings function will be called to extract text if the conversion utility fails.",
        'type' => "flag",
        'reqd' => "no" } ];

# Option record pushed onto each instance's 'option_list' by new().
my $options =
    { 'name'     => "ConvertToPlug",
      'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'. \nIf the argument is not present, the default is to inherit HTMLPlug.",
      'inherits' => "Yes",
      'args'     => $arguments };

# Print the usage message for this plugin's own options to STDERR.
#
#   $plugin_name - a plugin name, or a plugin object (in which case the
#                  class name of the object is used).
sub print_usage {
    my ($plugin_name) = @_;

    # for when this function is called directly by pluginfo.pl
    if (ref ($plugin_name)) {
        $plugin_name = ref ($plugin_name);
    }

    print STDERR "\n usage: plugin $plugin_name [options]\n\n";
    print STDERR " options:\n";
    print STDERR " -convert_to (html|text) plugin converts to TEXT or HTML\n";
    print STDERR " (default html)\n";
    print STDERR " -use_strings if set a simple strings function\n";
    print STDERR " will be called to extract text\n";
    print STDERR " if the conversion utility fails\n";
}

# Parse the plugin arguments that ConvertToPlug itself understands.
#
#   $class - class name of the plugin (a trailing ".pm" is stripped to form
#            the plugin name used in messages)
#   $args  - reference to the argument list; new() notes that
#            parsargv::parse might modify this list
#
# Returns ($plugin_name, $newargs) where $newargs is a hash reference with
# the keys 'kea', 'kea_options', 'generate_format' and 'use_strings'.
# If parsargv::parse reports failure, a usage message is printed and the
# sub dies.
sub parse_args {
    my $class = shift (@_);
    my ($args) = @_;

    my $plugin_name = $class;
    $plugin_name =~ s/\.pm$//;

    my $newargs = {};

    if (!parsargv::parse($args,
                         q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options (undocumented)
                         q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options (undocumented)
                         q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
                         q^use_strings^, \$newargs->{'use_strings'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to $plugin_name, ";
        print STDERR "check your collect.cfg configuration file\n";
        &print_usage($plugin_name);
        die "\n";
    }

    return ($plugin_name, $newargs);
}

# Constructor.
#
# Builds the object with TEXTPlug->new when 'convert_to' is "text" and with
# HTMLPlug->new otherwise, records the conversion target in the object
# ('convert_to', 'convert_to_ext'), copies every parsed argument into the
# object, and returns it blessed into $class.
sub new {
    my $class = shift (@_);
#    print "Class: " . $class . "\n";
#    if ($class eq "ConvertToPlug") {$class = shift (@_);}
    my $self;

    # parsargv::parse might modify the list, so we do this by creating a copy
    # of the argument list.
    my @arglist = @_;
    my (undef, $args) = $class->parse_args(\@_);

    if ($class eq "PDFPlug"
        && $args->{'generate_format'} eq "text"
        && defined $ENV{'GSDLOS'}
        && $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. \nPDFs will be converted to HTML instead\n";
        $args->{'generate_format'} = "html";
    }

    if ($args->{'generate_format'} eq "text") {
        $self = TEXTPlug->new($class, @arglist);
        $self->{'convert_to'} = "TEXT";
        $self->{'convert_to_ext'} = "txt";
    }
    else {
        $self = HTMLPlug->new($class, @arglist);
        $self->{'convert_to'} = "HTML";
        $self->{'convert_to_ext'} = "html";

        $self->{'rename_assoc_files'} = 1;
        $self->{'metadata_fields'} .= ",GENERATOR";
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    # Push through the hash slot itself so that the list is stored in the
    # object even when 'option_list' did not exist yet.
    push( @{$self->{'option_list'}}, $options );

    foreach my $key (keys %$args) {
        $self->{$key} = $args->{$key};
    }

    return bless $self, $class;
}

# Run conversion utility on the input file.
#
# The conversion takes place in a collection specific 'tmp' directory so
# that we don't accidentally damage the input.
#
#   $output_ext     - accepted for interface compatibility; not used here.
#                     The output type requested from the converter is the
#                     lower-cased value of $self->{'convert_to'}.
#   $input_filename - file to convert
#   $textref        - accepted for interface compatibility; not used here.
#
# Returns the name of the converted file inside the tmp directory, or the
# empty string if the converter reported "fail".  On success
# $self->{'convert_to_ext'} is set to the type the converter actually
# reported and $self->{'converted_to'} is set to "HTML" or "TEXT" when that
# type matches one of them.  On failure $self->{'num_not_processed'} is
# incremented and the converter's error log is copied to the output handle.
sub tmp_area_convert_file {
    my $self = shift (@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $outhandle = $self->{'outhandle'};
    my $convert_to = $self->{'convert_to'};
    my $failhandle = $self->{'failhandle'};

    # softlink to collection tmp dir
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # derive tmp filename from input filename
    my ($tailname, undef, $suffix)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Remove any white space from filename -- no risk of name collision, and
    # makes later conversion by utils simpler. Leave spaces in path...
    $tailname =~ s/\s+//g;

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    &util::soft_link($input_filename, $tmp_filename);

    my $verbosity = $self->{'verbosity'};
    if ($verbosity > 0) {
        print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    }

    my $errlog = &util::filename_cat($tmp_dirname, "err.log");

    # Execute the conversion command and get the type of the result,
    # making sure the converter gives us the appropriate output type
    #
    # NOTE(review): the command line is handed to the shell with the file
    # names interpolated inside double quotes, so a name containing a double
    # quote, a backtick or a '$' would be interpreted by the shell.  Left as
    # is because 'convert_options' is a free-form option string.
    my $output_type = lc($convert_to);
    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
    if (defined $self->{'convert_options'}) {
        $cmd .= $self->{'convert_options'} . " ";
    }
    if ($self->{'use_strings'}) {
        $cmd .= "-use_strings ";
    }
    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";

    $output_type = `$cmd`;
    # backticks yield undef when the command could not be run at all
    $output_type = "" unless defined $output_type;

    # remove symbolic link to original file
    &util::rm($tmp_filename);

    # Check STDERR here
    chomp $output_type;
    if ($output_type eq "fail") {
        print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
        print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n";
        $self->{'num_not_processed'} ++;

        # Copy the converter's error log (if it wrote one) to the output.
        if (-s $errlog) {
            if (open(my $errlog_fh, '<', $errlog)) {
                while (my $line = <$errlog_fh>) {
                    print $outhandle $line;
                }
                print $outhandle "\n";
                close($errlog_fh);
            }
            else {
                print $outhandle "Could not open $errlog: $!\n";
            }
        }
        &util::rm($errlog) if (-e $errlog);
        return "";
    }

    # store the *actual* output type and return the output filename
    # it's possible we requested conversion to html, but only to text succeeded
    $self->{'convert_to_ext'} = $output_type;
    if ($output_type =~ /html/i) {
        $self->{'converted_to'} = "HTML";
    }
    elsif ($output_type =~ /te?xt/i) {
        $self->{'converted_to'} = "TEXT";
    }

    # The suffix is literal text, not a pattern: quote it so that characters
    # such as '.' or '+' in an extension cannot change what is matched.
    my $output_filename = $tmp_filename;
    $output_filename =~ s/\Q$suffix\E$/.$output_type/;

    return $output_filename;
}

# Remove collection specific tmp directory and all its contents.
sub cleanup_tmp_area {
    my $self = shift (@_);

    # The directory is emptied by deleting it recursively and then
    # creating it again.
    my $scratch_dir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp");
    &util::rm_r($scratch_dir);
    &util::mk_dir($scratch_dir);
}

# Override BasPlug read
# We don't want to get language encoding stuff until after we've converted
# our file to either TEXT or HTML.
#
# Return values as implemented below:
#   undef - the file does not match 'process_exp', is not a plain file, or
#           $self->process returned undef
#   0     - the file matched 'block_exp', could not be converted, or the
#           converted file contained no text
#   1     - a document was built and handed to $processor
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
#    if ($self->is_recursive()) {
#        die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
#    }

    my $outhandle = $self->{'outhandle'};

    # Full path of the source file; $base_dir is only prepended when it
    # contains at least one word character.
    my $filename = ($base_dir =~ /\w/)
        ? &util::filename_cat ($base_dir, $file)
        : $file;

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    unless ($filename =~ /$self->{'process_exp'}/ && -f $filename) {
        return undef;
    }

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # read in file ($text will be in utf8)
    my $text = "";

    my $output_ext = $self->{'convert_to_ext'};
    my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename);

    # Either check failing allows continue on errors
    return 0 if ("$conv_filename" eq "" || ! -e "$conv_filename");
    $self->{'conv_filename'} = $conv_filename;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    &BasPlug::read_file($self, $conv_filename, $encoding, $language, \$text);

    unless (length ($text)) {
        my $plugin_name = ref ($self);
        if ($self->{'verbosity'}) {
            print $outhandle "$plugin_name: ERROR: $file contains no text\n";
        }
        return 0;
    }

    # Every "&...;" sequence in the text is replaced by the result of
    # ghtml::getcharequiv.  The original author's note says this is meant
    # for the HTML case (turning character entities into utf-8) and that it
    # should really happen before language extraction, but that would mean
    # modifying a file on disk.  The substitution is applied whatever the
    # converted type is.
    $text =~ s/&([^;]+);/&ghtml::getcharequiv($1,0)/ge;

    # create a new document
    my $doc_obj = doc->new($conv_filename, "indexed_doc");

    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

    # "Source" is the last path component of $file
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));

    $self->associate_cover_image($doc_obj, $filename) if ($self->{'cover_image'});

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $process_result = $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    return undef unless defined ($process_result);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    $self->cleanup_tmp_area();

    $self->{'num_processed'} ++;

    return 1;
}

# do plugin specific processing of doc_obj for HTML type
#
# Hands the converted text to TEXTPlug::process when the conversion produced
# TEXT and to HTMLPlug::process otherwise, then associates the original
# source file with the document as "doc.$doc_ext" and adds the srclink /
# srcicon metadata.  Returns whatever the delegated process call returned.
sub process_type {
    my $self = shift (@_);
    my ($doc_ext, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    # The delegated call is given the location of the converted file in the
    # tmp area rather than the location of the original.
    my $conv_filename = $self->{'conv_filename'};
    my $tmp_dirname = File::Basename::dirname($conv_filename);
    my $tmp_tailname = File::Basename::basename($conv_filename);

    my $converted_to = $self->{'converted_to'};
    my $delegate = ($converted_to eq "TEXT")
        ? \&TEXTPlug::process
        : \&HTMLPlug::process;

    my $ret_val = $delegate->($self, $textref, $pluginfo, $tmp_dirname, $tmp_tailname, $metadata, $doc_obj);

    # associate original file with doc object
    my $cursection = $doc_obj->get_top_section();
    my $filename = &util::filename_cat($base_dir, $file);
    $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection);

    # NOTE(review): "srclink" and "/srclink" are both stored as empty
    # strings here; presumably they were meant to carry opening and closing
    # link markup -- confirm against the intended collection format.
    my $doclink = "";
    my @link_metadata = (
        [ "srclink",  $doclink ],
        [ "srcicon",  "_icon".$doc_ext."_" ],
        [ "/srclink", "" ],
    );
    foreach my $entry (@link_metadata) {
        $doc_obj->add_utf8_metadata ($cursection, $entry->[0], $entry->[1]);
    }

    return $ret_val;
}

1;