########################################################################### # # HTMLPlugin.pm -- basic html plugin # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # # Note that this plugin handles frames only in a very simple way # i.e. each frame is treated as a separate document. This means # search results will contain links to individual frames rather # than linking to the top level frameset. # There may also be some problems caused by the _parent target # (it's removed by this plugin) # package HTMLPlugin; use ReadTextFile; use HBPlugin; use ghtml; use unicode; use util; use XMLParser; use Image::Size; use File::Copy; sub BEGIN { @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin'); } use strict; # every perl program should have this! no strict 'refs'; # make an exception so we can use variables as filehandles my $arguments = [ { 'name' => "process_exp", 'desc' => "{BasePlugin.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp() }, { 'name' => "block_exp", 'desc' => "{BasePlugin.block_exp}", 'type' => 'regexp', 'deft' => &get_default_block_exp() }, { 'name' => "nolinks", 'desc' => "{HTMLPlugin.nolinks}", 'type' => "flag" }, { 'name' => "keep_head", 'desc' => "{HTMLPlugin.keep_head}", 'type' => "flag" }, { 'name' => "no_metadata", 'desc' => "{HTMLPlugin.no_metadata}", 'type' => "flag" }, { 'name' => "metadata_fields", 'desc' => "{HTMLPlugin.metadata_fields}", 'type' => "string", 'deft' => "Title" }, { 'name' => "hunt_creator_metadata", 'desc' => "{HTMLPlugin.hunt_creator_metadata}", 'type' => "flag" }, { 'name' => "file_is_url", 'desc' => "{HTMLPlugin.file_is_url}", 'type' => "flag" }, { 'name' => "assoc_files", 'desc' => "{HTMLPlugin.assoc_files}", 'type' => "regexp", 'deft' => &get_default_block_exp() }, { 'name' => "rename_assoc_files", 'desc' => "{HTMLPlugin.rename_assoc_files}", 'type' => "flag" }, { 'name' => "title_sub", 'desc' => "{HTMLPlugin.title_sub}", 'type' => "string", 'deft' => "" }, { 'name' => "description_tags", 'desc' => "{HTMLPlugin.description_tags}", 'type' => "flag" }, # retain this for backward compatibility (w3mir option was replaced by # file_is_url) { 'name' => "w3mir", # 'desc' => "{HTMLPlugin.w3mir}", 'type' => "flag", 'hiddengli' => "yes"}, { 'name' => "no_strip_metadata_html", 'desc' => "{HTMLPlugin.no_strip_metadata_html}", 'type' => "string", 'deft' => "", 'reqd' => "no"}, { 'name' => "sectionalise_using_h_tags", 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}", 'type' => "flag" }, { 'name' => "use_realistic_book", 'desc' => "{HTMLPlugin.tidy_html}", 'type' => "flag"}, { 'name' => "old_style_HDL", 'desc' => "{HTMLPlugin.old_style_HDL}", 'type' => "flag"} ]; my $options = { 'name' => "HTMLPlugin", 'desc' => "{HTMLPlugin.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub HB_read_html_file { my $self = shift (@_); my ($htmlfile, $text) = @_; # load in the file if (!open (FILE, $htmlfile)) { print STDERR "ERROR - could not open $htmlfile\n"; return; } my $foundbody = 0; $self->HB_gettext (\$foundbody, $text, "FILE"); close FILE; # just in case there was no tag if (!$foundbody) { $foundbody = 1; open (FILE, $htmlfile) || return; $self->HB_gettext (\$foundbody, $text, "FILE"); close FILE; } # text is in utf8 } # converts the text to utf8, as ghtml does that for é etc. sub HB_gettext { my $self = shift (@_); my ($foundbody, $text, $handle) = @_; my $line = ""; while (defined ($line = <$handle>)) { # look for body tag if (!$$foundbody) { if ($line =~ s/^.*]*>//i) { $$foundbody = 1; } else { next; } } # check for symbol fonts if ($line =~ m/]*?face\s*=\s*\"?(\w+)\"?/i) { my $font = $1; print STDERR "HBPlug::HB_gettext - warning removed font $font\n" if ($font !~ m/^arial$/i); } $$text .= $line; } if ($self->{'input_encoding'} eq "iso_8859_1") { # convert to utf-8 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); } # convert any alphanumeric character entities to their utf-8 # equivalent for indexing purposes #&ghtml::convertcharentities ($$text); $$text =~ s/\s+/ /g; # remove \n's } sub HB_clean_section { my $self = shift (@_); my ($section) = @_; # remove tags without a starting tag from the section my ($tag, $tagstart); while ($section =~ m/<\/([^>]{1,10})>/) { $tag = $1; $tagstart = index($section, "<$tag"); last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); $section =~ s/<\/$tag>//; } # remove extra paragraph tags while ($section =~ s/]*>\s*|||]*>| |\s)$//i) {} # add a newline at the beginning of each paragraph $section =~ s/(.)\s*]*?src=\"?([^\">]+)\"?[^>]*>/
<\/center>/ig; $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/
<\/center>/ig; return $section; } # Will convert the oldHDL format to the new HDL format (using the Section tag) sub convert_to_newHDLformat { my $self = shift (@_); my ($file,$cnfile) = @_; my $input_filename = $file; my $tmp_filename = $cnfile; # write HTML tmp file with new HDL format open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); # read in the file and do basic html cleaning (removing header etc) my $html = ""; $self->HB_read_html_file ($input_filename, \$html); # process the file one section at a time my $curtoclevel = 1; my $firstsection = 1; my $toclevel = 0; while (length ($html) > 0) { if ($html =~ s/^.*?(?:]*>)?((|||\s)*)<<TOC(\d+)>>\s*(.*?)]*>)?((|||\s)*)<<TOC\d+>>)/$2/i) { $sectiontext = $1; } else { $sectiontext = $html; $html = ""; } # remove tags and extra spaces from the title $title =~ s/<\/?[^>]+>//g; $title =~ s/^\s+|\s+$//g; # close any sections below the current level and # create a new section (special case for the firstsection) print PROD "\n"; print PROD "\n\n$title\n\n\n"; print PROD "\n"; # clean up the section html $sectiontext = $self->HB_clean_section($sectiontext); print PROD "$sectiontext\n"; } else { print STDERR "WARNING - leftover text\n" , $self->shorten($html), "\nin $input_filename\n"; last; } $firstsection = 0; } print PROD "\n"; close (PROD) || die("Error Closing File: $tmp_filename $!"); return $tmp_filename; } sub shorten { my $self = shift (@_); my ($text) = @_; return "\"$text\"" if (length($text) < 100); return "\"" . substr ($text, 0, 50) . "\" ... \"" . substr ($text, length($text)-50) . "\""; } sub convert_tidy_or_oldHDL_file { my $self = shift (@_); my ($file) = @_; my $input_filename = $file; if (-d $input_filename) { return $input_filename; } # get the input filename my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); my $base_dirname = $dirname; $suffix = lc($suffix); # derive tmp filename from input filename # Remove any white space from filename -- no risk of name collision, and # makes later conversion by utils simpler. Leave spaces in path... # tidy up the filename with space, dot, hyphen between $tailname =~ s/\s+//g; $tailname =~ s/\.+//g; $tailname =~ s/\-+//g; # convert to utf-8 otherwise we have problems with the doc.xml file # later on &unicode::ensure_utf8(\$tailname); # softlink to collection tmp dir my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); my $test_dirname = ""; my $f_separator = &util::get_os_dirsep(); if ($dirname =~ m/import$f_separator/) { $test_dirname = $'; #' #print STDERR "init $'\n"; while ($test_dirname =~ m/[$f_separator]/) { my $folderdirname = $`; $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); $test_dirname = $'; #' } } my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) { #convert the input file to a new style HDL my $hdl_output_filename = $input_filename; if ($self->{'old_style_HDL'}) { $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); } #just for checking copy all other file from the base dir to tmp dir if it is not exists opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; my @files = grep {!/^\.+$/} readdir(DIR); close(DIR); foreach my $file (@files) { my $src_file = &util::filename_cat($base_dirname,$file); my $dest_file = &util::filename_cat($tmp_dirname,$file); if ((!-e $dest_file) && (!-d $src_file)) { # just copy the original file back to the tmp directory copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!"; } } # tidy the input file my $tidy_output_filename = $hdl_output_filename; if ($self->{'use_realistic_book'}) { $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); } $tmp_filename = $tidy_output_filename; } else { if (!-e $tmp_filename) { # just copy the original file back to the tmp directory copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!"; } } return $tmp_filename; } # Will make the html input file as a proper XML file with removed font tag and # image size added to the img tag. # The tidying process takes place in a collection specific 'tmp' directory so # that we don't accidentally damage the input. sub tmp_tidy_file { my $self = shift (@_); my ($file,$cnfile) = @_; my $input_filename = $file; my $tmp_filename = $cnfile; # get the input filename my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); require HTML::TokeParser::Simple; # create HTML parser to decode the input file my $parser = HTML::TokeParser::Simple->new($input_filename); # write HTML tmp file without the font tag and image size are added to the img tag open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); while (my $token = $parser->get_token()) { # is it an img tag if ($token->is_start_tag('img')) { # get the attributes my $attr = $token->return_attr; # get the full path to the image my $img_file = &util::filename_cat($dirname,$attr->{src}); # set the width and height attribute ($attr->{width}, $attr->{height}) = imgsize($img_file); # recreate the tag print PROD ""; } # is it a font tag else { if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) { # remove font tag print PROD ""; } else { # print without changes print PROD $token->as_is; } } } close (PROD) || die("Error Closing File: $tmp_filename $!"); # run html-tidy on the tmp file to make it a proper XML file my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`; # write result back to the tmp file open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); print PROD $tidyfile; close (PROD) || die("Error Closing File: $tmp_filename $!"); # return the output filename return $tmp_filename; } sub read_into_doc_obj { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; # get the input file my $input_filename = $file; my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); $suffix = lc($suffix); if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) { # because the document has to be sectionalized set the description tags $self->{'description_tags'} = 1; # set the file to be tidied $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/; # get the tidied file #my $tidy_filename = $self->tmp_tidy_file($input_filename); my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename); # derive tmp filename from input filename my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$"); # set the new input file and base_dir to be from the tidied file $file = "$tailname$suffix"; $base_dir = $dirname; } # call the parent read_into_doc_obj my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli); return ($process_status,$doc_obj); } sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); push(@{$hashArgOptLists->{"OptList"}},$options); my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists); if ($self->{'w3mir'}) { $self->{'file_is_url'} = 1; } $self->{'aux_files'} = {}; $self->{'dir_num'} = 0; $self->{'file_num'} = 0; return bless $self, $class; } # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$ # if have eg @ @sig; $tmptext =~ s/<[^>]*>/ /g; $tmptext =~ s/ / /g; $tmptext =~ s/^\s+//; $tmptext =~ s/\s+$//; $tmptext =~ s/\s+/ /gs; $tmptext = &unicode::substr ($tmptext, 0, $size); $tmptext =~ s/\s\S*$/…/; # adds an ellipse (...) $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext); } } sub extract_metadata { my $self = shift (@_); my ($textref, $metadata, $doc_obj, $section) = @_; my $outhandle = $self->{'outhandle'}; # if we don't want metadata, we may as well not be here ... return if (!defined $self->{'metadata_fields'}); # metadata fields to extract/save. 'key' is the (lowercase) name of the # html meta, 'value' is the metadata name for greenstone to use my %find_fields = (); my %creator_fields = (); # short-cut for lookups foreach my $field (split /,/, $self->{'metadata_fields'}) { $field =~ s/^\s+//; # remove leading whitespace $field =~ s/\s+$//; # remove trailing whitespace # support tag if ($field =~ m/^(.*?)<(.*?)>$/) { # "$2" is the user's preferred gs metadata name $find_fields{lc($1)}=$2; # lc = lowercase } else { # no for mapping # "$field" is the user's preferred gs metadata name $find_fields{lc($field)}=$field; # lc = lowercase } } if (defined $self->{'hunt_creator_metadata'} && $self->{'hunt_creator_metadata'} == 1 ) { my @extra_fields = ( 'author', 'author.email', 'creator', 'dc.creator', 'dc.creator.corporatename', ); # add the creator_metadata fields to search for foreach my $field (@extra_fields) { $creator_fields{$field}=0; # add to lookup hash } } # find the header in the html file, which has the meta tags $$textref =~ m@(.*?)@si; my $html_header=$1; # go through every " won't appear. (I don't think it's allowed to...) $html_header =~ m/^/; # match the start of the string, for \G assertion while ($html_header =~ m/\G.*?/sig) { my $metatag=$1; my ($tag, $value); # find the tag name $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; $tag=$2; # in case they're not using " or ', but they should... if (! $tag) { $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; $tag=$1; } if (!defined $tag) { print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n"; next; } # don't need to assign this field if it was passed in from a previous # (recursive) plugin if (defined $metadata->{$tag}) {next} # find the tag content $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is; $value=$2; if (! $value) { $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; $value=$1; } if (!defined $value) { print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n"; next; } # clean up and add $value =~ s/\s+/ /gs; chomp($value); # remove trailing \n, if any if (exists $creator_fields{lc($tag)}) { # map this value onto greenstone's "Creator" metadata $tag='Creator'; } elsif (!exists $find_fields{lc($tag)}) { next; # don't want this tag } else { # get the user's preferred capitalisation $tag = $find_fields{lc($tag)}; } if (lc($tag) eq "title") { $found_title = 1; } if ($self->{'verbosity'} > 2) { print $outhandle " extracted \"$tag\" metadata \"$value\"\n"; } # Do we still reply on the following? Surely there must # be a better way to go about this outside of the plugin? # #if ($tag =~ m/date.*/i){ # $tag = lc($tag); #} $doc_obj->add_utf8_metadata($section, $tag, $value); } # TITLE: extract the document title if (exists $find_fields{'title'} && !$found_title) { # we want a title, and didn't find one in the meta tags # see if there's a tag my $title; my $from = ""; # for debugging output only if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) { $title = $1; $from = "<title> tags"; } if (!defined $title) { $from = "first 100 chars"; # if no title use first 100 or so characters $title = $$textref; $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark $title =~ s/^.*?<body>//si; # ignore javascript! $title =~ s@<script.*?</script>@ @sig; $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space $title =~ s/<[^>]*>/ /g; # remove all HTML tags $title = substr ($title, 0, 100); $title =~ s/\s\S*$/.../; } $title =~ s/<[^>]*>/ /g; # remove html tags $title =~ s/ / /g; $title =~ s/(?: |\xc2\xa0)/ /g; # utf-8 for nbsp... $title =~ s/\s+/ /gs; # collapse multiple spaces $title =~ s/^\s*//; # remove leading spaces $title =~ s/\s*$//; # remove trailing spaces $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); $title =~ s/^\s+//s; # in case title_sub introduced any... $doc_obj->add_utf8_metadata ($section, 'Title', $title); print $outhandle " extracted Title metadata \"$title\" from $from\n" if ($self->{'verbosity'} > 2); } # add FileFormat metadata $doc_obj->add_metadata($section,"FileFormat", "HTML"); # Special, for metadata names such as tagH1 - extracts # the text between the first <H1> and </H1> tags into "H1" metadata. foreach my $field (keys %find_fields) { if ($field !~ m/^tag([a-z0-9]+)$/i) {next} my $tag = $1; if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) { my $content = $1; $content =~ s/ / /g; $content =~ s/<[^>]*>/ /g; $content =~ s/^\s+//; $content =~ s/\s+$//; $content =~ s/\s+/ /gs; if ($content) { $tag=$find_fields{"tag$tag"}; # get the user's capitalisation $tag =~ s/^tag//i; $doc_obj->add_utf8_metadata ($section, $tag, $content); print $outhandle " extracted \"$tag\" metadata \"$content\"\n" if ($self->{'verbosity'} > 2); } } } } # evaluate any "../" to next directory up # evaluate any "./" as here sub eval_dir_dots { my $self = shift (@_); my ($filename) = @_; my $dirsep_os = &util::get_os_dirsep(); my @dirsep = split(/$dirsep_os/,$filename); my @eval_dirs = (); foreach my $d (@dirsep) { if ($d eq "..") { pop(@eval_dirs); } elsif ($d eq ".") { # do nothing! } else { push(@eval_dirs,$d); } } # Need to fiddle with number of elements in @eval_dirs if the # first one is the empty string. This is because of a # modification to util::filename_cat that supresses the addition # of a leading '/' character (or \ if windows) (intended to help # filename cat with relative paths) if the first entry in the # array is the empty string. Making the array start with *two* # empty strings is a way to defeat this "smart" option. # if (scalar(@eval_dirs) > 0) { if ($eval_dirs[0] eq ""){ unshift(@eval_dirs,""); } } my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : ""; return $evaluated_filename; } sub replace_usemap_links { my $self = shift (@_); my ($front, $link, $back) = @_; # remove quotes from link at start and end if necessary if ($link=~/^[\"\']/) { $link=~s/^[\"\']//; $link=~s/[\"\']$//; $front.='"'; $back="\"$back"; } $link =~ s/^\.\///; return $front . $link . $back; } sub inc_filecount { my $self = shift (@_); if ($self->{'file_num'} == 1000) { $self->{'dir_num'} ++; $self->{'file_num'} = 0; } else { $self->{'file_num'} ++; } } # Extend read_file so that strings like é are # converted to UTF8 internally. # # We don't convert < or > or & or " in case # they interfere with the GML files sub read_file { my $self = shift(@_); my ($filename, $encoding, $language, $textref) = @_; $self->SUPER::read_file($filename, $encoding, $language, $textref); # Convert entities to their UTF8 equivalents $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo; $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go; } 1;