###########################################################################
#
# HTMLPlug.pm -- basic html plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
#
# Note that this plugin handles frames only in a very simple way
# i.e. each frame is treated as a separate document. This means
# search results will contain links to individual frames rather
# than linking to the top level frameset.
# There may also be some problems caused by the _parent target
# (it's removed by this plugin)
#
package HTMLPlug;
use BasPlug;
use ghtml;
use unicode;
use util;
use XMLParser;
sub BEGIN {
@HTMLPlug::ISA = ('BasPlug');
}
use strict; # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles
my $arguments =
[ { 'name' => "process_exp",
'desc' => "{BasPlug.process_exp}",
'type' => "regexp",
'deft' => &get_default_process_exp() },
{ 'name' => "block_exp",
'desc' => "{BasPlug.block_exp}",
'type' => 'regexp',
'deft' => &get_default_block_exp() },
{ 'name' => "nolinks",
'desc' => "{HTMLPlug.nolinks}",
'type' => "flag" },
{ 'name' => "keep_head",
'desc' => "{HTMLPlug.keep_head}",
'type' => "flag" },
{ 'name' => "extract_style",
'desc' => "{HTMLPlug.extract_style}",
'type' => "flag" },
{ 'name' => "no_metadata",
'desc' => "{HTMLPlug.no_metadata}",
'type' => "flag" },
{ 'name' => "metadata_fields",
'desc' => "{HTMLPlug.metadata_fields}",
'type' => "string",
'deft' => "Title" },
{ 'name' => "hunt_creator_metadata",
'desc' => "{HTMLPlug.hunt_creator_metadata}",
'type' => "flag" },
{ 'name' => "file_is_url",
'desc' => "{HTMLPlug.file_is_url}",
'type' => "flag" },
{ 'name' => "assoc_files",
'desc' => "{HTMLPlug.assoc_files}",
'type' => "regexp",
'deft' => &get_default_block_exp() },
{ 'name' => "rename_assoc_files",
'desc' => "{HTMLPlug.rename_assoc_files}",
'type' => "flag" },
{ 'name' => "title_sub",
'desc' => "{HTMLPlug.title_sub}",
'type' => "string",
'deft' => "" },
{ 'name' => "description_tags",
'desc' => "{HTMLPlug.description_tags}",
'type' => "flag" },
# retain this for backward compatibility (w3mir option was replaced by
# file_is_url)
{ 'name' => "w3mir",
# 'desc' => "{HTMLPlug.w3mir}",
'type' => "flag",
'hiddengli' => "yes"},
{ 'name' => "no_strip_metadata_html",
'desc' => "{HTMLPlug.no_strip_metadata_html}",
'type' => "string",
'deft' => "",
'reqd' => "no"},
{ 'name' => "sectionalise_using_h_tags",
'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
'type' => "flag" }
];
my $options = { 'name' => "HTMLPlug",
'desc' => "{HTMLPlug.desc}",
'abstract' => "no",
'inherits' => "yes",
'args' => $arguments };
sub new {
my ($class) = shift (@_);
my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
push(@$pluginlist, $class);
if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
if ($self->{'w3mir'}) {
$self->{'file_is_url'} = 1;
}
$self->{'aux_files'} = {};
$self->{'dir_num'} = 0;
$self->{'file_num'} = 0;
return bless $self, $class;
}
# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# if have eg @ @sig;
$tmptext =~ s/<[^>]*>/ /g;
$tmptext =~ s/ / /g;
$tmptext =~ s/^\s+//;
$tmptext =~ s/\s+$//;
$tmptext =~ s/\s+/ /gs;
$tmptext = &unicode::substr ($tmptext, 0, $size);
$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
}
}
sub extract_metadata {
my $self = shift (@_);
my ($textref, $metadata, $doc_obj, $section) = @_;
my $outhandle = $self->{'outhandle'};
# if we don't want metadata, we may as well not be here ...
return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);
# metadata fields to extract/save. 'key' is the (lowercase) name of the
# html meta, 'value' is the metadata name for greenstone to use
my %find_fields = ();
my %creator_fields = (); # short-cut for lookups
foreach my $field (split /,/, $self->{'metadata_fields'}) {
# support tag
if ($field =~ /^(.*?)<(.*?)>$/) {
# "$2" is the user's preferred gs metadata name
$find_fields{lc($1)}=$2; # lc = lowercase
} else { # no for mapping
# "$field" is the user's preferred gs metadata name
$find_fields{lc($field)}=$field; # lc = lowercase
}
}
if ($self->{'hunt_creator_metadata'} == 1 ) {
my @extra_fields =
(
'author',
'author.email',
'creator',
'dc.creator',
'dc.creator.corporatename',
);
# add the creator_metadata fields to search for
foreach my $field (@extra_fields) {
$creator_fields{$field}=0; # add to lookup hash
}
}
# find the header in the html file, which has the meta tags
$$textref =~ m@(.*?)@si;
my $html_header=$1;
# go through every " won't appear. (I don't think it's allowed to...)
$html_header =~ /^/; # match the start of the string, for \G assertion
while ($html_header =~ m/\G.*?/sig) {
my $metatag=$1;
my ($tag, $value);
# find the tag name
$metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
$tag=$2;
# in case they're not using " or ', but they should...
if (! $tag) {
$metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
$tag=$1;
}
if (!defined $tag) {
print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
next;
}
# don't need to assign this field if it was passed in from a previous
# (recursive) plugin
if (defined $metadata->{$tag}) {next}
# find the tag content
$metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
$value=$2;
if (! $value) {
$metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
$value=$1;
}
if (!defined $value) {
print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
next;
}
# clean up and add
$value =~ s/\s+/ /gs;
chomp($value); # remove trailing \n, if any
if (exists $creator_fields{lc($tag)}) {
# map this value onto greenstone's "Creator" metadata
$tag='Creator';
} elsif (!exists $find_fields{lc($tag)}) {
next; # don't want this tag
} else {
# get the user's preferred capitalisation
$tag = $find_fields{lc($tag)};
}
if (lc($tag) eq "title") {
$found_title = 1;
}
print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
if ($self->{'verbosity'} > 2);
if ($tag =~ /date.*/i){
$tag = lc($tag);
}
$doc_obj->add_utf8_metadata($section, $tag, $value);
}
# TITLE: extract the document title
if (exists $find_fields{'title'} && !$found_title) {
# we want a title, and didn't find one in the meta tags
# see if there's a tag
my $title;
my $from = ""; # for debugging output only
if ($html_header =~ /]*>([^<]+)<\/title[^>]*>/is) {
$title = $1;
$from = " tags";
}
if (!defined $title) {
$from = "first 100 chars";
# if no title use first 100 or so characters
$title = $$textref;
$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
$title =~ s/^.*?//si;
# ignore javascript!
$title =~ s@@ @sig;
$title =~ s/<\/([^>]+)><\1>//g; # (eg) - no space
$title =~ s/<[^>]*>/ /g; # remove all HTML tags
$title = substr ($title, 0, 100);
$title =~ s/\s\S*$/.../;
}
$title =~ s/<[^>]*>/ /g; # remove html tags
$title =~ s/ / /g;
$title =~ s/(?: |\xc2\xa0)/ /g; # utf-8 for nbsp...
$title =~ s/\s+/ /gs; # collapse multiple spaces
$title =~ s/^\s*//; # remove leading spaces
$title =~ s/\s*$//; # remove trailing spaces
$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
$title =~ s/^\s+//s; # in case title_sub introduced any...
$doc_obj->add_utf8_metadata ($section, 'Title', $title);
print $outhandle " extracted Title metadata \"$title\" from $from\n"
if ($self->{'verbosity'} > 2);
}
# add FileFormat metadata
$doc_obj->add_metadata($section,"FileFormat", "HTML");
# Special, for metadata names such as tagH1 - extracts
# the text between the first and
tags into "H1" metadata.
foreach my $field (keys %find_fields) {
if ($field !~ /^tag([a-z0-9]+)$/i) {next}
my $tag = $1;
if ($$textref =~ m@<$tag[^>]*>(.*?)$tag[^>]*>@g) {
my $content = $1;
$content =~ s/ / /g;
$content =~ s/<[^>]*>/ /g;
$content =~ s/^\s+//;
$content =~ s/\s+$//;
$content =~ s/\s+/ /gs;
if ($content) {
$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
$tag =~ s/^tag//i;
$doc_obj->add_utf8_metadata ($section, $tag, $content);
print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
if ($self->{'verbosity'} > 2);
}
}
}
}
sub extract_style {
my $self = shift (@_);
my ($textref, $doc_obj, $section, $base_dir, $file) = @_;
my $outhandle = $self->{'outhandle'};
# find the header in the html file, which has the style info
$$textref =~ m@(.*?)@si;
my $html_header=$1;
my $style_contents = "";
# look for style tags
$html_header =~ /^/; # match the start of the string, for \G assertion
while ($html_header =~ m/\G.*?<(style|script|link)/sig) {
my $tag_name = $1;
if ($tag_name eq "style") {
if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) {
$style_contents .= "\n