########################################################################### # # HTMLPlugin.pm -- basic html plugin # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # # Note that this plugin handles frames only in a very simple way # i.e. each frame is treated as a separate document. This means # search results will contain links to individual frames rather # than linking to the top level frameset. # There may also be some problems caused by the _parent target # (it's removed by this plugin) # package HTMLPlugin; use Encode; use Unicode::Normalize 'normalize'; use ReadTextFile; use HBPlugin; use ghtml; use unicode; use util; use FileUtils; use XMLParser; use File::Copy; sub BEGIN { @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin'); die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); # for Image/Size.pm } use Image::Size; use strict; # every perl program should have this! no strict 'refs'; # make an exception so we can use variables as filehandles my $arguments = [ { 'name' => "process_exp", 'desc' => "{BaseImporter.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp() }, { 'name' => "block_exp", 'desc' => "{CommonUtil.block_exp}", 'type' => 'regexp', 'deft' => &get_default_block_exp() }, { 'name' => "nolinks", 'desc' => "{HTMLPlugin.nolinks}", 'type' => "flag" }, { 'name' => "keep_head", 'desc' => "{HTMLPlugin.keep_head}", 'type' => "flag" }, { 'name' => "no_metadata", 'desc' => "{HTMLPlugin.no_metadata}", 'type' => "flag" }, { 'name' => "metadata_fields", 'desc' => "{HTMLPlugin.metadata_fields}", 'type' => "string", 'deft' => "Title" }, { 'name' => "metadata_field_separator", 'desc' => "{HTMLPlugin.metadata_field_separator}", 'type' => "string", 'deft' => "" }, { 'name' => "hunt_creator_metadata", 'desc' => "{HTMLPlugin.hunt_creator_metadata}", 'type' => "flag" }, { 'name' => "file_is_url", 'desc' => "{HTMLPlugin.file_is_url}", 'type' => "flag" }, { 'name' => "assoc_files", 'desc' => "{HTMLPlugin.assoc_files}", 'type' => "regexp", 'deft' => &get_default_block_exp() }, { 'name' => "rename_assoc_files", 'desc' => "{HTMLPlugin.rename_assoc_files}", 'type' => "flag" }, { 'name' => "title_sub", 'desc' => "{HTMLPlugin.title_sub}", 'type' => "string", 'deft' => "" }, { 'name' => "description_tags", 'desc' => "{HTMLPlugin.description_tags}", 'type' => "flag" }, # retain this for backward compatibility (w3mir option was replaced by # file_is_url) { 'name' => "w3mir", # 'desc' => "{HTMLPlugin.w3mir}", 'type' => "flag", 'hiddengli' => "yes"}, { 'name' => "no_strip_metadata_html", 'desc' => "{HTMLPlugin.no_strip_metadata_html}", 'type' => "string", 'deft' => "", 'reqd' => "no"}, { 'name' => "sectionalise_using_h_tags", 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}", 'type' => "flag" }, { 'name' => "use_realistic_book", 'desc' => "{HTMLPlugin.tidy_html}", 'type' => "flag"}, { 'name' => "old_style_HDL", 'desc' => "{HTMLPlugin.old_style_HDL}", 'type' => "flag"}, {'name' => "processing_tmp_files", 'desc' => "{BaseImporter.processing_tmp_files}", 'type' => "flag", 'hiddengli' => "yes"} ]; my $options = { 'name' => "HTMLPlugin", 'desc' => "{HTMLPlugin.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); push(@{$hashArgOptLists->{"OptList"}},$options); my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists); if ($self->{'w3mir'}) { $self->{'file_is_url'} = 1; } $self->{'aux_files'} = {}; $self->{'dir_num'} = 0; $self->{'file_num'} = 0; return bless $self, $class; } # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$ # if have eg @ @sig; $tmptext =~ s/<[^>]*>/ /g; $tmptext =~ s/ / /g; $tmptext =~ s/^\s+//; $tmptext =~ s/\s+$//; $tmptext =~ s/\s+/ /gs; $tmptext = &unicode::substr ($tmptext, 0, $size); $tmptext =~ s/\s\S*$/…/; # adds an ellipse (...) $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext); } } sub extract_metadata { my $self = shift (@_); my ($textref, $metadata, $doc_obj, $section) = @_; my $outhandle = $self->{'outhandle'}; # if we don't want metadata, we may as well not be here ... return if (!defined $self->{'metadata_fields'}); my $separator = $self->{'metadata_field_separator'}; if ($separator eq "") { undef $separator; } # metadata fields to extract/save. 'key' is the (lowercase) name of the # html meta, 'value' is the metadata name for greenstone to use my %find_fields = (); my %creator_fields = (); # short-cut for lookups foreach my $field (split /,/, $self->{'metadata_fields'}) { $field =~ s/^\s+//; # remove leading whitespace $field =~ s/\s+$//; # remove trailing whitespace # support tag if ($field =~ m/^(.*?)\s*<(.*?)>$/) { # "$2" is the user's preferred gs metadata name $find_fields{lc($1)}=$2; # lc = lowercase } else { # no for mapping # "$field" is the user's preferred gs metadata name $find_fields{lc($field)}=$field; # lc = lowercase } } if (defined $self->{'hunt_creator_metadata'} && $self->{'hunt_creator_metadata'} == 1 ) { my @extra_fields = ( 'author', 'author.email', 'creator', 'dc.creator', 'dc.creator.corporatename', ); # add the creator_metadata fields to search for foreach my $field (@extra_fields) { $creator_fields{$field}=0; # add to lookup hash } } # find the header in the html file, which has the meta tags $$textref =~ m@(.*?)@si; my $html_header=$1; # go through every " won't appear. (I don't think it's allowed to...) $html_header =~ m/^/; # match the start of the string, for \G assertion while ($html_header =~ m/\G.*?/sig) { my $metatag=$1; my ($tag, $value); # find the tag name $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; $tag=$2; # in case they're not using " or ', but they should... if (! $tag) { $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; $tag=$1; } if (!defined $tag) { print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n"; next; } # don't need to assign this field if it was passed in from a previous # (recursive) plugin if (defined $metadata->{$tag}) {next} # find the tag content $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is; $value=$2; # The following code assigns the metaname to value if value is # empty. Why would we do this? #if (! $value) { # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; # $value=$1; #} if (!defined $value || $value eq "") { print $outhandle "HTMLPlugin: can't find VALUE in \n" if ($self->{'verbosity'} > 2); next; } # clean up and add $value =~ s/\s+/ /gs; chomp($value); # remove trailing \n, if any if (exists $creator_fields{lc($tag)}) { # map this value onto greenstone's "Creator" metadata $tag='Creator'; } elsif (!exists $find_fields{lc($tag)}) { next; # don't want this tag } else { # get the user's preferred capitalisation $tag = $find_fields{lc($tag)}; } if (lc($tag) eq "title") { $found_title = 1; } if ($self->{'verbosity'} > 2) { print $outhandle " extracted \"$tag\" metadata \"$value\"\n"; } if ($tag =~ /\./) { # there is a . so has a namespace, add ex. $tag = "ex.$tag"; } if (defined $separator) { my @values = split($separator, $value); foreach my $v (@values) { $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/; } } else { $doc_obj->add_utf8_metadata($section, $tag, $value); } } # TITLE: extract the document title if (exists $find_fields{'title'} && !$found_title) { # we want a title, and didn't find one in the meta tags # see if there's a tag my $title; my $from = ""; # for debugging output only if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) { $title = $1; $from = "<title> tags"; } if (!defined $title) { $from = "first 100 chars"; # if no title use first 100 or so characters $title = $$textref; $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark $title =~ s/^.*?<body>//si; # ignore javascript! $title =~ s@<script.*?</script>@ @sig; $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space $title =~ s/<[^>]*>/ /g; # remove all HTML tags $title =~ s@\r@@g; # remove Windows carriage returns to ensure that titles of pdftohtml docs are consistent (the same 100 chars) across windows and linux $title = substr ($title, 0, 100); $title =~ s/\s\S*$/.../; } $title =~ s/<[^>]*>/ /g; # remove html tags $title =~ s/ / /g; $title =~ s/(?: |\xc2\xa0)/ /g; # utf-8 for nbsp... $title =~ s/\s+/ /gs; # collapse multiple spaces $title =~ s/^\s*//; # remove leading spaces $title =~ s/\s*$//; # remove trailing spaces $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); $title =~ s/^\s+//s; # in case title_sub introduced any... $doc_obj->add_utf8_metadata ($section, "Title", $title); print $outhandle " extracted Title metadata \"$title\" from $from\n" if ($self->{'verbosity'} > 2); } # add FileFormat metadata $doc_obj->add_metadata($section,"FileFormat", "HTML"); # Special, for metadata names such as tagH1 - extracts # the text between the first <H1> and </H1> tags into "H1" metadata. foreach my $field (keys %find_fields) { if ($field !~ m/^tag([a-z0-9]+)$/i) {next} my $tag = $1; if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) { my $content = $1; $content =~ s/ / /g; $content =~ s/<[^>]*>/ /g; $content =~ s/^\s+//; $content =~ s/\s+$//; $content =~ s/\s+/ /gs; if ($content) { $tag=$find_fields{"tag$tag"}; # get the user's capitalisation $tag =~ s/^tag//i; $doc_obj->add_utf8_metadata ($section, $tag, $content); print $outhandle " extracted \"$tag\" metadata \"$content\"\n" if ($self->{'verbosity'} > 2); } } } } # evaluate any "../" to next directory up # evaluate any "./" as here sub eval_dir_dots { my $self = shift (@_); my ($filename) = @_; my $dirsep_os = &util::get_os_dirsep(); my @dirsep = split(/$dirsep_os/,$filename); my @eval_dirs = (); foreach my $d (@dirsep) { if ($d eq "..") { pop(@eval_dirs); } elsif ($d eq ".") { # do nothing! } else { push(@eval_dirs,$d); } } # Need to fiddle with number of elements in @eval_dirs if the # first one is the empty string. This is because of a # modification to FileUtils::filenameConcatenate that supresses the addition # of a leading '/' character (or \ if windows) (intended to help # filename cat with relative paths) if the first entry in the # array is the empty string. Making the array start with *two* # empty strings is a way to defeat this "smart" option. # if (scalar(@eval_dirs) > 0) { if ($eval_dirs[0] eq ""){ unshift(@eval_dirs,""); } } my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : ""; return $evaluated_filename; } sub replace_usemap_links { my $self = shift (@_); my ($front, $link, $back) = @_; # remove quotes from link at start and end if necessary if ($link=~/^[\"\']/) { $link=~s/^[\"\']//; $link=~s/[\"\']$//; $front.='"'; $back="\"$back"; } $link =~ s/^\.\///; return $front . $link . $back; } sub inc_filecount { my $self = shift (@_); if ($self->{'file_num'} == 1000) { $self->{'dir_num'} ++; $self->{'file_num'} = 0; } else { $self->{'file_num'} ++; } } # Extend read_file so that strings like é are # converted to UTF8 internally. # # We don't convert < or > or & or " in case # they interfere with the GML files sub read_file { my $self = shift(@_); my ($filename, $encoding, $language, $textref) = @_; $self->SUPER::read_file($filename, $encoding, $language, $textref); # Convert entities to their Unicode code-point equivalents $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo; $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go; } sub HB_read_html_file { my $self = shift (@_); my ($htmlfile, $text) = @_; # load in the file if (!open (FILE, $htmlfile)) { print STDERR "ERROR - could not open $htmlfile\n"; return; } my $foundbody = 0; $self->HB_gettext (\$foundbody, $text, "FILE"); close FILE; # just in case there was no <body> tag if (!$foundbody) { $foundbody = 1; open (FILE, $htmlfile) || return; $self->HB_gettext (\$foundbody, $text, "FILE"); close FILE; } # text is in utf8 } # converts the text to utf8, as ghtml does that for é etc. sub HB_gettext { my $self = shift (@_); my ($foundbody, $text, $handle) = @_; my $line = ""; while (defined ($line = <$handle>)) { # look for body tag if (!$$foundbody) { if ($line =~ s/^.*<body[^>]*>//i) { $$foundbody = 1; } else { next; } } # check for symbol fonts if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { my $font = $1; print STDERR "HBPlug::HB_gettext - warning removed font $font\n" if ($font !~ m/^arial$/i); } $$text .= $line; } if ($self->{'input_encoding'} eq "iso_8859_1") { # convert to utf-8 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); } # convert any alphanumeric character entities to their utf-8 # equivalent for indexing purposes #&ghtml::convertcharentities ($$text); $$text =~ s/\s+/ /g; # remove \n's # At this point $$text is a binary byte string # => turn it into a Unicode aware string, so full # Unicode aware pattern matching can be used. # For instance: 's/\x{0101}//g' or '[[:upper:]]' # $$text = decode("utf8",$$text); } sub HB_clean_section { my $self = shift (@_); my ($section) = @_; # remove tags without a starting tag from the section my ($tag, $tagstart); while ($section =~ m/<\/([^>]{1,10})>/) { $tag = $1; $tagstart = index($section, "<$tag"); last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); $section =~ s/<\/$tag>//; } # remove extra paragraph tags while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {} # remove extra stuff at the end of the section while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>| |\s)$//i) {} # add a newline at the beginning of each paragraph $section =~ s/(.)\s*<p\b/$1\n\n<p/gi; # add a newline every 80 characters at a word boundary # Note: this regular expression puts a line feed before # the last word in each section, even when it is not # needed. $section =~ s/(.{1,80})\s/$1\n/g; # fix up the image links $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/ <center><img src=\"$1\" \/><\/center><br\/>/ig; $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/ <center><img src=\"$1\" \/><\/center><br\/>/ig; return $section; } # Will convert the oldHDL format to the new HDL format (using the Section tag) sub convert_to_newHDLformat { my $self = shift (@_); my ($file,$cnfile) = @_; my $input_filename = $file; my $tmp_filename = $cnfile; # write HTML tmp file with new HDL format open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); # read in the file and do basic html cleaning (removing header etc) my $html = ""; $self->HB_read_html_file ($input_filename, \$html); # process the file one section at a time my $curtoclevel = 1; my $firstsection = 1; my $toclevel = 0; while (length ($html) > 0) { if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC(\d+)>>\s*(.*?)<p\b/<p/i) { $toclevel = $3; my $title = $4; my $sectiontext = ""; if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC\d+>>)/$2/i) { $sectiontext = $1; } else { $sectiontext = $html; $html = ""; } # remove tags and extra spaces from the title $title =~ s/<\/?[^>]+>//g; $title =~ s/^\s+|\s+$//g; # close any sections below the current level and # create a new section (special case for the firstsection) print PROD "<!--\n"; while (($curtoclevel > $toclevel) || (!$firstsection && $curtoclevel == $toclevel)) { $curtoclevel--; print PROD "</Section>\n"; } if ($curtoclevel+1 < $toclevel) { print STDERR "WARNING - jump in toc levels in $input_filename " . "from $curtoclevel to $toclevel\n"; } while ($curtoclevel < $toclevel) { $curtoclevel++; } if ($curtoclevel == 1) { # add the header tag print PROD "-->\n"; print PROD "<HTML>\n<HEAD>\n<TITLE>$title\n\n\n"; print PROD "\n"; # clean up the section html $sectiontext = $self->HB_clean_section($sectiontext); print PROD "$sectiontext\n"; } else { print STDERR "WARNING - leftover text\n" , $self->shorten($html), "\nin $input_filename\n"; last; } $firstsection = 0; } print PROD "\n"; close (PROD) || die("Error Closing File: $tmp_filename $!"); return $tmp_filename; } sub shorten { my $self = shift (@_); my ($text) = @_; return "\"$text\"" if (length($text) < 100); return "\"" . substr ($text, 0, 50) . "\" ... \"" . substr ($text, length($text)-50) . "\""; } sub convert_tidy_or_oldHDL_file { my $self = shift (@_); my ($file) = @_; my $input_filename = $file; if (-d $input_filename) { return $input_filename; } # get the input filename my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); my $base_dirname = $dirname; $suffix = lc($suffix); # derive tmp filename from input filename # Remove any white space from filename -- no risk of name collision, and # makes later conversion by utils simpler. Leave spaces in path... # tidy up the filename with space, dot, hyphen between $tailname =~ s/\s+//g; $tailname =~ s/\.+//g; $tailname =~ s/\-+//g; # convert to utf-8 otherwise we have problems with the doc.xml file # later on &unicode::ensure_utf8(\$tailname); # softlink to collection tmp dir my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname); my $test_dirname = ""; my $f_separator = &util::get_os_dirsep(); if ($dirname =~ m/import$f_separator/) { $test_dirname = $'; #' #print STDERR "init $'\n"; while ($test_dirname =~ m/[$f_separator]/) { my $folderdirname = $`; $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname); &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname); $test_dirname = $'; #' } } my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix"); # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) { #convert the input file to a new style HDL my $hdl_output_filename = $input_filename; if ($self->{'old_style_HDL'}) { $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix"); $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); } #just for checking copy all other file from the base dir to tmp dir if it is not exists opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; my @files = grep {!/^\.+$/} readdir(DIR); close(DIR); foreach my $file (@files) { my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file); my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file); if ((!-e $dest_file) && (!-d $src_file)) { # just copy the original file back to the tmp directory copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!"; } } # tidy the input file my $tidy_output_filename = $hdl_output_filename; if ($self->{'use_realistic_book'}) { $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix"); $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); } $tmp_filename = $tidy_output_filename; } else { if (!-e $tmp_filename) { # just copy the original file back to the tmp directory copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!"; } } return $tmp_filename; } # Will make the html input file as a proper XML file with removed font tag and # image size added to the img tag. # The tidying process takes place in a collection specific 'tmp' directory so # that we don't accidentally damage the input. sub tmp_tidy_file { my $self = shift (@_); my ($file,$cnfile) = @_; my $input_filename = $file; my $tmp_filename = $cnfile; # get the input filename my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); require HTML::TokeParser::Simple; # create HTML parser to decode the input file my $parser = HTML::TokeParser::Simple->new($input_filename); # write HTML tmp file without the font tag and image size are added to the img tag open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); while (my $token = $parser->get_token()) { # is it an img tag if ($token->is_start_tag('img')) { # get the attributes my $attr = $token->return_attr; # get the full path to the image my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src}); # set the width and height attribute ($attr->{width}, $attr->{height}) = imgsize($img_file); # recreate the tag print PROD ""; } # is it a font tag else { if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) { # remove font tag print PROD ""; } else { # print without changes print PROD $token->as_is; } } } close (PROD) || die("Error Closing File: $tmp_filename $!"); # run html-tidy on the tmp file to make it a proper XML file my $outhandle = $self->{'outhandle'}; print $outhandle "Converting HTML to be XML compliant:\n"; my $tidy_cmd = "tidy"; $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2); $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\""; if ($self->{'verbosity'} <= 2) { if ($ENV{'GSDLOS'} =~ m/^windows/i) { $tidy_cmd .= " 2>nul"; } else { $tidy_cmd .= " 2>/dev/null"; } print $outhandle " => $tidy_cmd\n"; } my $tidyfile = `$tidy_cmd`; # write result back to the tmp file open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); print PROD $tidyfile; close (PROD) || die("Error Closing File: $tmp_filename $!"); # return the output filename return $tmp_filename; } sub associate_cover_image { my $self = shift(@_); my ($doc_obj, $filename) = @_; if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) { # we will have cover image in tidytmp, but want it from import $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/; } $self->SUPER::associate_cover_image($doc_obj, $filename); } 1;