########################################################################### # # HTMLPlug.pm -- basic html plugin # # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # # Note that this plugin handles frames only in a very simple way # i.e. each frame is treated as a separate document. This means # search results will contain links to individual frames rather # than linking to the top level frameset. # There may also be some problems caused by the _parent target # (it's removed by this plugin) # package HTMLPlug; use BasPlug; use ghtml; use unicode; use util; use XMLParser; sub BEGIN { @HTMLPlug::ISA = ('BasPlug'); } use strict; # every perl program should have this! no strict 'refs'; # make an exception so we can use variables as filehandles my $arguments = [ { 'name' => "process_exp", 'desc' => "{BasPlug.process_exp}", 'type' => "regexp", 'deft' => &get_default_process_exp() }, { 'name' => "block_exp", 'desc' => "{BasPlug.block_exp}", 'type' => 'regexp', 'deft' => &get_default_block_exp() }, { 'name' => "nolinks", 'desc' => "{HTMLPlug.nolinks}", 'type' => "flag" }, { 'name' => "keep_head", 'desc' => "{HTMLPlug.keep_head}", 'type' => "flag" }, { 'name' => "extract_style", 'desc' => "{HTMLPlug.extract_style}", 'type' => "flag" }, { 'name' => "no_metadata", 'desc' => "{HTMLPlug.no_metadata}", 'type' => "flag" }, { 'name' => "metadata_fields", 'desc' => "{HTMLPlug.metadata_fields}", 'type' => "string", 'deft' => "Title" }, { 'name' => "hunt_creator_metadata", 'desc' => "{HTMLPlug.hunt_creator_metadata}", 'type' => "flag" }, { 'name' => "file_is_url", 'desc' => "{HTMLPlug.file_is_url}", 'type' => "flag" }, { 'name' => "assoc_files", 'desc' => "{HTMLPlug.assoc_files}", 'type' => "regexp", 'deft' => &get_default_block_exp() }, { 'name' => "rename_assoc_files", 'desc' => "{HTMLPlug.rename_assoc_files}", 'type' => "flag" }, { 'name' => "title_sub", 'desc' => "{HTMLPlug.title_sub}", 'type' => "string", 'deft' => "" }, { 'name' => "description_tags", 'desc' => "{HTMLPlug.description_tags}", 'type' => "flag" }, # retain this for backward compatibility (w3mir option was replaced by # file_is_url) { 'name' => "w3mir", # 'desc' => "{HTMLPlug.w3mir}", 'type' => "flag", 'hiddengli' => "yes"}, { 'name' => "no_strip_metadata_html", 'desc' => "{HTMLPlug.no_strip_metadata_html}", 'type' => "string", 'deft' => "", 'reqd' => "no"}, { 'name' => "sectionalise_using_h_tags", 'desc' => "{HTMLPlug.sectionalise_using_h_tags}", 'type' => "flag" } ]; my $options = { 'name' => "HTMLPlug", 'desc' => "{HTMLPlug.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($pluginlist,$inputargs,$hashArgOptLists) = @_; push(@$pluginlist, $class); if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});} if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); if ($self->{'w3mir'}) { $self->{'file_is_url'} = 1; } $self->{'aux_files'} = {}; $self->{'dir_num'} = 0; $self->{'file_num'} = 0; return bless $self, $class; } # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$ # if have eg @ @sig; $tmptext =~ s/<[^>]*>/ /g; $tmptext =~ s/ / /g; $tmptext =~ s/^\s+//; $tmptext =~ s/\s+$//; $tmptext =~ s/\s+/ /gs; $tmptext = &unicode::substr ($tmptext, 0, $size); $tmptext =~ s/\s\S*$/…/; # adds an ellipse (...) $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext); } } sub extract_metadata { my $self = shift (@_); my ($textref, $metadata, $doc_obj, $section) = @_; my $outhandle = $self->{'outhandle'}; # if we don't want metadata, we may as well not be here ... return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0); # metadata fields to extract/save. 'key' is the (lowercase) name of the # html meta, 'value' is the metadata name for greenstone to use my %find_fields = (); my %creator_fields = (); # short-cut for lookups foreach my $field (split /,/, $self->{'metadata_fields'}) { # support tag if ($field =~ /^(.*?)<(.*?)>$/) { # "$2" is the user's preferred gs metadata name $find_fields{lc($1)}=$2; # lc = lowercase } else { # no for mapping # "$field" is the user's preferred gs metadata name $find_fields{lc($field)}=$field; # lc = lowercase } } if ($self->{'hunt_creator_metadata'} == 1 ) { my @extra_fields = ( 'author', 'author.email', 'creator', 'dc.creator', 'dc.creator.corporatename', ); # add the creator_metadata fields to search for foreach my $field (@extra_fields) { $creator_fields{$field}=0; # add to lookup hash } } # find the header in the html file, which has the meta tags $$textref =~ m@(.*?)@si; my $html_header=$1; # go through every " won't appear. (I don't think it's allowed to...) $html_header =~ /^/; # match the start of the string, for \G assertion while ($html_header =~ m/\G.*?/sig) { my $metatag=$1; my ($tag, $value); # find the tag name $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; $tag=$2; # in case they're not using " or ', but they should... if (! $tag) { $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; $tag=$1; } if (!defined $tag) { print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n"; next; } # don't need to assign this field if it was passed in from a previous # (recursive) plugin if (defined $metadata->{$tag}) {next} # find the tag content $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is; $value=$2; if (! $value) { $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; $value=$1; } if (!defined $value) { print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n"; next; } # clean up and add $value =~ s/\s+/ /gs; chomp($value); # remove trailing \n, if any if (exists $creator_fields{lc($tag)}) { # map this value onto greenstone's "Creator" metadata $tag='Creator'; } elsif (!exists $find_fields{lc($tag)}) { next; # don't want this tag } else { # get the user's preferred capitalisation $tag = $find_fields{lc($tag)}; } if (lc($tag) eq "title") { $found_title = 1; } print $outhandle " extracted \"$tag\" metadata \"$value\"\n" if ($self->{'verbosity'} > 2); if ($tag =~ /date.*/i){ $tag = lc($tag); } $doc_obj->add_utf8_metadata($section, $tag, $value); } # TITLE: extract the document title if (exists $find_fields{'title'} && !$found_title) { # we want a title, and didn't find one in the meta tags # see if there's a tag my $title; my $from = ""; # for debugging output only if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) { $title = $1; $from = "<title> tags"; } if (!defined $title) { $from = "first 100 chars"; # if no title use first 100 or so characters $title = $$textref; $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark $title =~ s/^.*?<body>//si; # ignore javascript! $title =~ s@<script.*?</script>@ @sig; $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space $title =~ s/<[^>]*>/ /g; # remove all HTML tags $title = substr ($title, 0, 100); $title =~ s/\s\S*$/.../; } $title =~ s/<[^>]*>/ /g; # remove html tags $title =~ s/ / /g; $title =~ s/(?: |\xc2\xa0)/ /g; # utf-8 for nbsp... $title =~ s/\s+/ /gs; # collapse multiple spaces $title =~ s/^\s*//; # remove leading spaces $title =~ s/\s*$//; # remove trailing spaces $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); $title =~ s/^\s+//s; # in case title_sub introduced any... $doc_obj->add_utf8_metadata ($section, 'Title', $title); print $outhandle " extracted Title metadata \"$title\" from $from\n" if ($self->{'verbosity'} > 2); } # add FileFormat metadata $doc_obj->add_metadata($section,"FileFormat", "HTML"); # Special, for metadata names such as tagH1 - extracts # the text between the first <H1> and </H1> tags into "H1" metadata. foreach my $field (keys %find_fields) { if ($field !~ /^tag([a-z0-9]+)$/i) {next} my $tag = $1; if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) { my $content = $1; $content =~ s/ / /g; $content =~ s/<[^>]*>/ /g; $content =~ s/^\s+//; $content =~ s/\s+$//; $content =~ s/\s+/ /gs; if ($content) { $tag=$find_fields{"tag$tag"}; # get the user's capitalisation $tag =~ s/^tag//i; $doc_obj->add_utf8_metadata ($section, $tag, $content); print $outhandle " extracted \"$tag\" metadata \"$content\"\n" if ($self->{'verbosity'} > 2); } } } } sub extract_style { my $self = shift (@_); my ($textref, $doc_obj, $section, $base_dir, $file) = @_; my $outhandle = $self->{'outhandle'}; # find the header in the html file, which has the style info $$textref =~ m@<head>(.*?)</head>@si; my $html_header=$1; my $style_contents = ""; # look for style tags $html_header =~ /^/; # match the start of the string, for \G assertion while ($html_header =~ m/\G.*?<(style|script|link)/sig) { my $tag_name = $1; if ($tag_name eq "style") { if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) { $style_contents .= "\n<style"; $style_contents .= $1; } } elsif ($tag_name eq "link") { $style_contents .= "\n<link"; $html_header =~ m/\G(.*?>)/is; $style_contents .= $1; } elsif ($tag_name eq "script") { # bit more tricky cos it may or may not have content if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) { $style_contents .= "\n<script"; $style_contents .= $1; } elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) { $style_contents .= "\n<script"; $style_contents .= $1; } } } # now we need to do something with any links found in the style thing $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge; $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents); } # evaluate any "../" to next directory up # evaluate any "./" as here sub eval_dir_dots { my $self = shift (@_); my ($filename) = @_; my $dirsep_os = &util::get_os_dirsep(); my @dirsep = split(/$dirsep_os/,$filename); my @eval_dirs = (); foreach my $d (@dirsep) { if ($d eq "..") { pop(@eval_dirs); } elsif ($d eq ".") { # do nothing! } else { push(@eval_dirs,$d); } } # Need to fiddle with number of elements in @eval_dirs if the # first one is the empty string. This is because of a # modification to util::filename_cat that supresses the addition # of a leading '/' character (or \ if windows) (intended to help # filename cat with relative paths) if the first entry in the # array is the empty string. Making the array start with *two* # empty strings is a way to defeat this "smart" option. # if (scalar(@eval_dirs) > 0) { if ($eval_dirs[0] eq ""){ unshift(@eval_dirs,""); } } return &util::filename_cat(@eval_dirs); } sub replace_usemap_links { my $self = shift (@_); my ($front, $link, $back) = @_; $link =~ s/^\.\///; return $front . $link . $back; } sub inc_filecount { my $self = shift (@_); if ($self->{'file_num'} == 1000) { $self->{'dir_num'} ++; $self->{'file_num'} = 0; } else { $self->{'file_num'} ++; } } # Extend the BasPlug read_file so that strings like é are # converted to UTF8 internally. # # We don't convert < or > or & or " in case # they interfere with the GML files sub read_file { my ($self, $filename, $encoding, $language, $textref) = @_; &BasPlug::read_file($self, $filename, $encoding, $language, $textref); # Convert entities to their UTF8 equivalents $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo; $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go; } 1;