########################################################################### # # CONTENTdmPlugin.pm -- reasonably with-it pdf plugin # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999-2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package CONTENTdmPlugin; use ConvertBinaryFile; use ReadXMLFile; use unicode; use ghtml; use strict; no strict 'refs'; # so we can use a var for filehandles (eg STDERR) use XMLParser; # inherit ReadXMLFile for the apply_xslt method sub BEGIN { @CONTENTdmPlugin::ISA = ('ConvertBinaryFile', 'ReadXMLFile'); } my $convert_to_list = [ { 'name' => "auto", 'desc' => "{ConvertBinaryFile.convert_to.auto}" }, { 'name' => "html", 'desc' => "{ConvertBinaryFile.convert_to.html}" }, { 'name' => "text", 'desc' => "{ConvertBinaryFile.convert_to.text}" }, { 'name' => "pagedimg", 'desc' => "{ConvertBinaryFile.convert_to.pagedimg}"}, ]; my $arguments = [ { 'name' => "convert_to", 'desc' => "{ConvertBinaryFile.convert_to}", 'type' => "enum", 'reqd' => "yes", 'list' => $convert_to_list, 'deft' => "html" }, { 'name' => "xslt", 'desc' => "{ReadXMLFile.xslt}", 'type' => "string", 'deft' => "", 'reqd' => "no" }, { 'name' => "process_exp", 'desc' => "{BasePlugin.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp(), 'reqd' => "no" }, { 'name' => "block_exp", 'desc' => "{BasePlugin.block_exp}", 'type' => "regexp", 'deft' => &get_default_block_exp() } ]; my $options = { 'name' => "CONTENTdmPlugin", 'desc' => "{CONTENTdmPlugin.desc}", 'abstract' => "no", 'inherits' => "yes", # CONTENTdmPlugin is one of the few ConvertBinaryFile subclasses whose source doc can't be replaced by a GS-generated html 'srcreplaceable' => "no", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); push(@$inputargs,"-title_sub"); push(@$inputargs,'^(Page\s+\d+)?(\s*1\s+)?'); push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); push(@{$hashArgOptLists->{"OptList"}},$options); my @arg_array = @$inputargs; my $self = new ConvertBinaryFile($pluginlist,$inputargs,$hashArgOptLists); if ($self->{'info_only'}) { # don't worry about any options etc return bless $self, $class; } my $parser = new XML::Parser('Style' => 'Stream', 'Pkg' => 'ReadXMLFile', 'PluginObj' => $self, 'Handlers' => {'Char' => \&ReadXMLFile::Char, 'XMLDecl' => \&ReadXMLFile::XMLDecl, 'Entity' => \&ReadXMLFile::Entity, 'Doctype' => \&ReadXMLFile::Doctype, 'Default' => \&ReadXMLFile::Default }); $self->{'parser'} = $parser; $self->{'rdf_desc'} = undef; $self->{'about_key'} = undef; $self->{'metadata_name'} = undef; $self->{'metadata_value'} = undef; $self->{'convert_to'} = "PagedImage"; my $secondary_plugin_options = $self->{'secondary_plugin_options'}; if (!defined $secondary_plugin_options->{'PagedImagePlugin'}){ $secondary_plugin_options->{'PagedImagePlugin'} = []; } my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'}; push(@$pagedimg_options, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); push(@$pagedimg_options, "-create_thumbnail", "true", "-create_screenview", "true"); push(@$pagedimg_options, "-file_rename_method", "none"); push(@$pagedimg_options, "-processing_tmp_files"); $self = bless $self, $class; # ***** no longer needed! # # This needs to be done after blss, to $self passed to XML::Parser # # can correctly resolve the right call-back methods during XML parsing $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists); return $self; } sub get_default_process_exp { my $self = shift (@_); return q^(?i)\.rdf$^; } sub get_default_block_exp { return q^(?i)\.(jpg|jpeg|gif)$^; } sub rdf_desc_to_id { my $self = shift (@_); my ($rdf_desc) = @_; my $rdf_id = {}; # initialise any .cpd (=complex multi page) structures foreach my $about_key (keys %{$rdf_desc}) { if ($about_key =~ m/\.cpd$/) { my $about = $rdf_desc->{$about_key}; my $id = $about->{'dc:identifier'}; if ($id =~ m/^\s*$/) { # missing id, make one up based on about attribute my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$"); $id = "about:$tailname"; } $rdf_id->{$id} = $about; $rdf_id->{$id}->{'ex:filename'} = $about_key; $rdf_id->{$id}->{'ex:type'} = "complex"; $rdf_id->{$id}->{'pages'} = []; } } # now add in *non* .cpd items foreach my $about_key (keys %{$rdf_desc}) { if ($about_key !~ m/\.cpd$/) { my $about = $rdf_desc->{$about_key}; my $id = $about->{'dc:identifier'}; if ($id =~ m/^\s*$/) { # missing id, make one up based on about attribute my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($about_key, "\\.[^\\.]+\$"); $id = "about:$tailname"; } if (defined $rdf_id->{$id}) { $about->{'ex:filename'} = $about_key; # dealing with complex multi-page situation # Add to existing structure my $pages = $rdf_id->{$id}->{'pages'}; push(@$pages,$about) } else { # New entry $rdf_id->{$id} = $about; $rdf_id->{$id}->{'ex:type'} = "simple"; $rdf_id->{$id}->{'ex:filename'} = $about_key; } } } return $rdf_id; } sub metadata_table_txt_file { my $self = shift (@_); my ($output_root,$page_num) = @_; my $txt_filename = $output_root."_page_$page_num.txt"; my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($txt_filename, "\\.[^\\.]+\$"); my $txt_file = "$tailname$suffix"; return $txt_file; } sub output_metadata_table { my $self = shift (@_); my ($page,$page_num,$tmp_dirname,$txt_file) = @_; my $txt_filename = &util::filename_cat($tmp_dirname,$txt_file); open(TOUT,">$txt_filename") || die "Error: unable to write metadata data out as txt file $txt_filename: $!\n"; print TOUT $page->{'MetadataTable'}; delete $page->{'MetadataTable'}; close (TOUT); } sub rdf_id_to_item_file { my $self = shift (@_); my ($rdf_id,$tmp_dirname,$output_root) = @_; my $item_file_list = []; foreach my $id (keys %{$rdf_id}) { my $id_safe = $id; $id_safe =~ s/ /-/g; my $output_filename = $output_root."_$id_safe.item"; open(FOUT,">$output_filename") || die "Unable to open $output_filename: $!\n"; print FOUT "\n"; my $rdf_doc = $rdf_id->{$id}; foreach my $metadata_name (keys %$rdf_doc) { next if ($metadata_name eq "pages"); my $metadata_value = $rdf_doc->{$metadata_name}; # convert ns:name to ns.Name $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/; print FOUT " $metadata_value\n"; } if ($rdf_doc->{'ex:type'} eq "complex") { my $pages = $rdf_doc->{'pages'}; my $page_num = 1; foreach my $page (@$pages) { my $imgfile = $page->{'ex:filename'}; if ($imgfile =~ m/(http|ftp):/) { $imgfile = "empty.jpg"; } else { $imgfile = &util::filename_cat("..","import",$imgfile); } my $txt_file = $self->metadata_table_txt_file($output_root,$page_num); $self->output_metadata_table($page,$page_num, $tmp_dirname,$txt_file); print FOUT " \n"; foreach my $metadata_name (keys %$page) { my $metadata_value = $rdf_doc->{$metadata_name}; # convert ns:name to ns.Name $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/; print FOUT " $metadata_value\n"; } $page_num++; print FOUT " \n"; } } else { # simple # duplicate top-level metadata for now plus image to bind to my $imgfile = $rdf_doc->{'ex:filename'}; if ($imgfile =~ m/(http|ftp):/) { $imgfile = "empty.jpg"; } else { $imgfile = &util::filename_cat("..","import",$imgfile); } my $txt_file = $self->metadata_table_txt_file($output_root,1); $self->output_metadata_table($rdf_doc,1,$tmp_dirname,$txt_file); print FOUT " \n"; foreach my $metadata_name (keys %$rdf_doc) { my $metadata_value = $rdf_doc->{$metadata_name}; # convert ns:name to ns.Name $metadata_name =~ s/^(.*?):(.*)/$1\.\u$2/; print FOUT " $metadata_value\n"; } print FOUT " \n"; } print FOUT "\n"; close(FOUT); push(@$item_file_list,$output_filename); } return $item_file_list; } sub xml_area_convert_file { my $self = shift (@_); my ($input_filename, $tmp_dirname, $output_root) = @_; eval { # Build up hash table/tree of all records my $xslt = $self->{'xslt'}; if (defined $xslt && ($xslt ne "")) { # perform xslt my $transformed_xml = $self->apply_xslt($xslt,$input_filename); open(TOUT,">/tmp/tout.xml") || die "Unable to open /tmp/tout.xml: $!\n"; print TOUT $transformed_xml; close(TOUT); # feed transformed file (now in memory as string) into XML parser $self->{'parser'}->parse($transformed_xml); } else { $self->{'parser'}->parsefile($input_filename); } }; if ($@) { # parsefile may either croak somewhere in XML::Parser (e.g. because # the document is not well formed) or die somewhere in XMLPlug or a # derived plugin (e.g. because we're attempting to process a # document whose DOCTYPE is not meant for this plugin). For the # first case we'll print a warning and continue, for the second # we'll just continue quietly print STDERR "**** Error is: $@\n"; my $file = $self->{'file'}; my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/; if (defined $msg) { my $outhandle = $self->{'outhandle'}; my $plugin_name = ref ($self); print $outhandle "$plugin_name failed to process $file ($msg)\n"; } my $gli = $self->{'gli'}; # reset ourself for the next document $self->{'section_level'}=0; print STDERR "\n" if ($gli); return ("fail",undef); # error during processing } my $rdf_desc = $self->{'rdf_desc'}; # foreach my $about_key (keys %{$rdf_desc}) { # my $about = $rdf_desc->{$about_key}; # foreach my $metadata_name (keys %{$about}) { # # my $metadata_value = $about->{$metadata_name}; ## print STDERR " $metadata_name: $metadata_value\n"; # } # } # Merge entries with same name my $merged_rdf_id = $self->rdf_desc_to_id($rdf_desc); # foreach my $about_key (keys %{$merged_rdf_id}) { # my $about = $merged_rdf_id->{$about_key}; # foreach my $metadata_name (keys %{$about}) { # # my $metadata_value = $about->{$metadata_name}; ## print STDERR " $metadata_name: $metadata_value\n"; # } # } my $item_files = $self->rdf_id_to_item_file($merged_rdf_id,$tmp_dirname, $output_root); return ("item",$item_files); } # Override ConvertBinaryFile tmp_area_convert_file() to provide solution specific # to CONTENTdm # # A better (i.e. in the future) solution would be to see if this can be # shifted into gsConvert.pl so there is no need to override the # default tmp_area_convert_file() sub tmp_area_convert_file { my $self = shift (@_); my ($output_ext, $input_filename, $textref) = @_; # is textref ever used?!? my $outhandle = $self->{'outhandle'}; my $convert_to = $self->{'convert_to'}; my $failhandle = $self->{'failhandle'}; my $convert_to_ext = $self->{'convert_to_ext'}; # softlink to collection tmp dir my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp"); &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); # derive tmp filename from input filename my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); # Remove any white space from filename -- no risk of name collision, and # makes later conversion by utils simpler. Leave spaces in path... # tidy up the filename with space, dot, hyphen between $tailname =~ s/\s+//g; $tailname =~ s/\.+//g; $tailname =~ s/\-+//g; $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname); $suffix = lc($suffix); my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); &util::soft_link($input_filename, $tmp_filename); my $verbosity = $self->{'verbosity'}; if ($verbosity > 0) { print $outhandle "Converting $tailname$suffix to $convert_to format\n"; } my $errlog = &util::filename_cat($tmp_dirname, "err.log"); # call xml_area_convert_file rather than gsConvert.pl my $output_root = &util::filename_cat($tmp_dirname, "$tailname"); my ($output_type,$item_files) = $self->xml_area_convert_file($tmp_filename,$tmp_dirname,$output_root); my $fakeimg_filename = &util::filename_cat($dirname, "empty.jpg"); my $fakeimg_tmp_filename = &util::filename_cat($tmp_dirname, "empty.jpg"); print STDERR "***** No source image identified with item\n"; print STDERR "***** Using default \"no image available\" $fakeimg_filename -> $fakeimg_tmp_filename\n"; &util::soft_link($fakeimg_filename, $fakeimg_tmp_filename); # continue as before ... # remove symbolic link to original file &util::rm($tmp_filename); # Check STDERR here chomp $output_type; if ($output_type eq "fail") { print $outhandle "Could not convert $tailname$suffix to $convert_to format\n"; print $failhandle "$tailname$suffix: " . ref($self) . " failed to convert to $convert_to\n"; $self->{'num_not_processed'} ++; if (-s "$errlog") { open(ERRLOG, "$errlog"); while () { print $outhandle "$_"; } print $outhandle "\n"; close ERRLOG; } &util::rm("$errlog") if (-e "$errlog"); return []; } # store the *actual* output type and return the output filename # it's possible we requested conversion to html, but only to text succeeded #$self->{'convert_to_ext'} = $output_type; if ($output_type =~ /html/i) { $self->{'converted_to'} = "HTML"; } elsif ($output_type =~ /te?xt/i) { $self->{'converted_to'} = "Text"; } elsif ($output_type =~ /item/i){ $self->{'converted_to'} = "PagedImage"; } return $item_files; } # Override ConvertBinaryFile read # Needed so multiple .item files generated are sent down secondary plugin sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; $self->{'gli'} = $gli; $self->{'file'} = $file; my $successful_rv = -1; my $outhandle = $self->{'outhandle'}; my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); return undef unless $self->can_process_this_file($filename_full_path); $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up # read() deviates at this point from ConvertBinaryFile # Need to work with list of filename returned my $output_ext = $self->{'convert_to_ext'}; my $conv_filename_list = []; $conv_filename_list = $self->tmp_area_convert_file($output_ext, $filename_full_path); if (scalar(@$conv_filename_list)==0) { return -1; } # had an error, will be passed down pipeline foreach my $conv_filename ( @$conv_filename_list ) { if (! -e "$conv_filename") {return -1;} $self->{'conv_filename'} = $conv_filename; # is this used anywhere? $self->convert_post_process($conv_filename); my $secondary_plugins = $self->{'secondary_plugins'}; my $num_secondary_plugins = scalar(keys %$secondary_plugins); if ($num_secondary_plugins == 0) { print $outhandle "Warning: No secondary plugin to use in conversion. Skipping $file\n"; return 0; # effectively block it } my @plugin_names = keys %$secondary_plugins; my $plugin_name = shift @plugin_names; if ($num_secondary_plugins > 1) { print $outhandle "Warning: Multiple secondary plugins not supported yet! Choosing $plugin_name\n."; } my $secondary_plugin = $secondary_plugins->{$plugin_name}; # note: metadata is not carried on to the next level my ($rv,$doc_obj) = $secondary_plugin->read_into_doc_obj ($pluginfo,"", $conv_filename, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli); print STDERR "**** $conv_filename => returned rv = $rv\n"; if ((defined $rv) && ($rv>=0)) { $successful_rv = 1; } # Override previous gsdlsourcefilename set by secondary plugin my $collect_file = &util::filename_within_collection($filename_full_path); my $collect_conv_file = &util::filename_within_collection($conv_filename); $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'}); $doc_obj->set_converted_filename($collect_conv_file); my ($filemeta) = $file =~ /([^\\\/]+)$/; $self->set_Source_metadata($doc_obj, $filemeta); $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); if ($self->{'cover_image'}) { $self->associate_cover_image($doc_obj, $filename_full_path); } # do plugin specific processing of doc_obj unless (defined ($self->process(undef, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) { print STDERR "***** process returned undef: $base_dir $file\n"; print STDERR "\n" if ($gli); return -1; } # do any automatic metadata extraction $self->auto_extract_metadata ($doc_obj); # have we found a Title?? $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$filemeta); # add an OID $self->add_OID($doc_obj); # process the document $processor->process($doc_obj); $self->{'num_processed'} ++; } return $successful_rv; } sub process { return 1; } # do we need this? sec pluginn process would have already been called as part of read_into_doc_obj?? sub process_old { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; my $secondary_plugins = $self->{'secondary_plugins'}; my @plugin_names = keys %$secondary_plugins; my $plugin_name = shift @plugin_names; # already checked there is only one my $secondary_plugin = $secondary_plugins->{$plugin_name}; my $result = $secondary_plugin->process(@_); return $result; } # Called at the beginning of the XML document. sub xml_start_document { my $self = shift(@_); my ($expat) = @_; $self->{'rdf_desc'} = {}; } # Called for DOCTYPE declarations - use die to bail out if this doctype # is not meant for this plugin sub xml_doctype { my $self = shift(@_); my ($expat, $name, $sysid, $pubid, $internal) = @_; die "" if ($name !~ /^rdf:RDF$/); my $outhandle = $self->{'outhandle'}; print $outhandle "CONTENTdmPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; } # Called for every start tag. The $_ variable will contain a copy of the # tag and the %_ variable will contain the element's attributes. sub xml_start_tag { my $self = shift(@_); my ($expat, $element) = @_; if ($element eq "rdf:Description") { my $about_key = $_{'about'}; my $rdf_desc = $self->{'rdf_desc'}; $rdf_desc->{$about_key} = {}; $self->{'about_key'} = $about_key; $self->{'index_text'} = ""; $self->{'pp_text'} = "\n"; } elsif (defined $self->{'about_key'}) { $self->{'metadata_name'} = $element; $self->{'metadata_value'} = ""; } } # Called for every end tag. The $_ variable will contain a copy of the tag. sub xml_end_tag { my $self = shift(@_); my ($expat, $element) = @_; if ($element eq "rdf:Description") { $self->{'pp_text'} .= "
\n"; ## ghtml::htmlsafe($self->{'pp_text'}); my $about_key = $self->{'about_key'}; my $about = $self->{'rdf_desc'}->{$about_key}; $about->{'IndexText'} = $self->{'index_text'}; $about->{'MetadataTable'} = $self->{'pp_text'}; $self->{'about_key'} = undef; $self->{'index_text'} = undef; $self->{'pp_text'} = undef; } elsif (defined $self->{'metadata_name'}) { my $metadata_name = $self->{'metadata_name'}; if ($element eq $metadata_name) { my $metadata_value = $self->{'metadata_value'}; my $about_key = $self->{'about_key'}; my $about = $self->{'rdf_desc'}->{$about_key}; $about->{$metadata_name} = $metadata_value; $self->{'index_text'} .= "$metadata_value\n"; $self->{'pp_text'} .= " $metadata_name$metadata_value\n"; $self->{'metadata_name'} = undef; $self->{'metadata_value'} = undef; } } } # Called just before start or end tags with accumulated non-markup text in # the $_ variable. sub xml_text { my $self = shift(@_); my ($expat) = @_; if (defined $self->{'metadata_name'}) { $self->{'metadata_value'} .= $_; } } # Called at the end of the XML document. sub xml_end_document { my $self = shift(@_); my ($expat) = @_; } 1;