tags if found otherwise first 100 characters # outside of tags. # will also attempt to include images which it will search for in # the same directory as the document itself (it will also search # directories relative to that directory). # this plugin currently does nothing with href links so relative links # may become broken. package HTMLPlug; use BasPlug; use sorttools; use util; sub BEGIN { @ISA = ('BasPlug'); } sub new { my ($class) = @_; $self = new BasPlug (); return bless $self, $class; } sub is_recursive { my $self = shift (@_); return 0; # this is not a recursive plugin } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; my $filename = &util::filename_cat($base_dir, $file); my $absdir = $filename; $absdir =~ s/[^\/\\]*$//; return undef unless ($filename =~ /\.(html?(\.gz)?)$/i && (-e $filename)); my $gz = 0; if (defined $2) { $gz = $2; $gz = 1 if ($gz =~ /\.gz/i); } print STDERR "HTMLPlug: processing $filename\n" if $processor->{'verbosity'}; # create a new document my $doc_obj = new doc ($file, "indexed_doc"); if ($gz) { open (FILE, "zcat $filename |") || die "HTMLPlug::read - zcat can't open $filename\n"; } else { open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n"; } my $cursection = $doc_obj->get_top_section(); my $text = ""; my $line = ""; my $donehead = 0; my $title = ""; while (defined ($line = <FILE>)) { $text .= $line; } # remove line breaks $text =~ s/\s+/ /g; # see if there's a <title> tag my $foundtitle = 0; if ($text =~ /<title[^>]*>([^<]*)<\/title[^>]*>/i) { if (defined $1) { my $title = $1; if ($title =~ /\w/) { $doc_obj->add_metadata ($cursection, "Title", $title); $foundtitle = 1; } } } # if no title use first 100 characters if (!$foundtitle) { my $tmptext = $text; $tmptext =~ s/<[^>]*>//g; my $title = substr ($tmptext, 0, 100); $doc_obj->add_metadata ($cursection, "Title", $title); } # remove header rubbish $text =~ s/^.*?<body[^>]*>//i; # fix up the image links $text =~ s/(<img[^>]*?src=\"?)([^\">]+)(\"?[^>]*>)/ &replace_image_links($absdir, $doc_obj, $1, $2, $3)/ige; # add a newline at the beginning of each paragraph $text =~ s/(.)\s*<p\b/$1\n\n<p/gi; # add a newline every 80 characters at a word boundary # Note: this regular expression puts a line feed before # the last word in each section, even when it is not # needed. $text =~ s/(.{1,80})\s/$1\n/g; $doc_obj->add_text ($cursection, $text); foreach $field (keys(%$metadata)) { # $metadata->{$field} may be an array reference if (ref ($metadata->{$field}) eq "ARRAY") { map { $doc_obj->add_metadata ($cursection, $field, $_); } @{$metadata->{$field}}; } else { $doc_obj->add_metadata ($cursection, $field, $metadata->{$field}); } } # add OID $doc_obj->set_OID (); # process the document $processor->process($doc_obj); return 1; # processed the file } sub replace_image_links { my ($dir, $doc_obj, $front, $link, $back) = @_; my ($filename, $error); my $foundimage = 0; $link =~ s/\/\///; my ($imagetype) = $link =~ /([^\.]*)$/; $imagetype =~ tr/[A-Z]/[a-z]/; if ($imagetype eq "jpg") {$imagetype = "jpeg";} if ($imagetype !~ /^(jpg|gif|png)$/) { print STDERR "HTMLPlug: Warning - unknown image type ($imagetype)\n"; } my ($imagefile) = $link =~ /([^\/]*)$/; my ($imagepath) = $link =~ /^[^\/]*(.*)$/; if (defined $imagepath && $imagepath =~ /\w/) { # relative link $filename = &util::filename_cat ($dir, $imagepath); if (-e $filename) { $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype"); $foundimage = 1; } else { $error = "HTMLPlug: Warning - couldn't find image file $imagefile in either $filename or"; } } if (!$foundimage) { $filename = &util::filename_cat ($dir, $imagefile); if (-e $filename) { $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype"); $foundimage = 1; } elsif (defined $error) { print STDERR "$error $filename\n"; } else { print STDERR "HTMLPlug: Warning - couldn't find image file $imagefile in $filename\n"; } } if ($foundimage) { return "${front}_httpcollection_/archives/_thisOID_/${imagefile}${back}"; } else { return ""; } } 1;

########################################################################### # # HTMLPlug.pm -- basic html plugin # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # creates simple single-level document from .htm or .html files # (case-insensitive match on filenames). Adds Title metadata # taken from tags if found otherwise first 100 characters # outside of tags. # will also attempt to include images which it will search for in # the same directory as the document itself (it will also search # directories relative to that directory). # this plugin currently does nothing with href links so relative links # may become broken. package HTMLPlug; use BasPlug; use sorttools; use util; sub BEGIN { @ISA = ('BasPlug'); } sub new { my ($class) = @_; $self = new BasPlug (); return bless $self, $class; } sub is_recursive { my $self = shift (@_); return 0; # this is not a recursive plugin } # return number of files processed, undef if can't process # Note that $base_dir might be "" and that $file might # include directories sub read { my $self = shift (@_); my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; my $filename = &util::filename_cat($base_dir, $file); my $absdir = $filename; $absdir =~ s/[^\/\\]*$//; return undef unless ($filename =~ /\.(html?(\.gz)?)$/i && (-e $filename)); my $gz = 0; if (defined $2) { $gz = $2; $gz = 1 if ($gz =~ /\.gz/i); } print STDERR "HTMLPlug: processing $filename\n" if $processor->{'verbosity'}; # create a new document my $doc_obj = new doc ($file, "indexed_doc"); if ($gz) { open (FILE, "zcat $filename |") || die "HTMLPlug::read - zcat can't open $filename\n"; } else { open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n"; } my $cursection = $doc_obj->get_top_section(); my $text = ""; my $line = ""; my $donehead = 0; my $title = ""; while (defined ($line = <FILE>)) { $text .= $line; } # remove line breaks $text =~ s/\s+/ /g; # see if there's a <title> tag my $foundtitle = 0; if ($text =~ /<title[^>]*>([^<]*)<\/title[^>]*>/i) { if (defined $1) { my $title = $1; if ($title =~ /\w/) { $doc_obj->add_metadata ($cursection, "Title", $title); $foundtitle = 1; } } } # if no title use first 100 characters if (!$foundtitle) { my $tmptext = $text; $tmptext =~ s/<[^>]*>//g; my $title = substr ($tmptext, 0, 100); $doc_obj->add_metadata ($cursection, "Title", $title); } # remove header rubbish $text =~ s/^.*?<body[^>]*>//i; # fix up the image links $text =~ s/(<img[^>]*?src=\"?)([^\">]+)(\"?[^>]*>)/ &replace_image_links($absdir, $doc_obj, $1, $2, $3)/ige; # add a newline at the beginning of each paragraph $text =~ s/(.)\s*<p\b/$1\n\n<p/gi; # add a newline every 80 characters at a word boundary # Note: this regular expression puts a line feed before # the last word in each section, even when it is not # needed. $text =~ s/(.{1,80})\s/$1\n/g; $doc_obj->add_text ($cursection, $text); foreach $field (keys(%$metadata)) { # $metadata->{$field} may be an array reference if (ref ($metadata->{$field}) eq "ARRAY") { map { $doc_obj->add_metadata ($cursection, $field, $_); } @{$metadata->{$field}}; } else { $doc_obj->add_metadata ($cursection, $field, $metadata->{$field}); } } # add OID $doc_obj->set_OID (); # process the document $processor->process($doc_obj); return 1; # processed the file } sub replace_image_links { my ($dir, $doc_obj, $front, $link, $back) = @_; my ($filename, $error); my $foundimage = 0; $link =~ s/\/\///; my ($imagetype) = $link =~ /([^\.]*)$/; $imagetype =~ tr/[A-Z]/[a-z]/; if ($imagetype eq "jpg") {$imagetype = "jpeg";} if ($imagetype !~ /^(jpg|gif|png)$/) { print STDERR "HTMLPlug: Warning - unknown image type ($imagetype)\n"; } my ($imagefile) = $link =~ /([^\/]*)$/; my ($imagepath) = $link =~ /^[^\/]*(.*)$/; if (defined $imagepath && $imagepath =~ /\w/) { # relative link $filename = &util::filename_cat ($dir, $imagepath); if (-e $filename) { $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype"); $foundimage = 1; } else { $error = "HTMLPlug: Warning - couldn't find image file $imagefile in either $filename or"; } } if (!$foundimage) { $filename = &util::filename_cat ($dir, $imagefile); if (-e $filename) { $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype"); $foundimage = 1; } elsif (defined $error) { print STDERR "$error $filename\n"; } else { print STDERR "HTMLPlug: Warning - couldn't find image file $imagefile in $filename\n"; } } if ($foundimage) { return "${front}_httpcollection_/archives/_thisOID_/${imagefile}${back}"; } else { return ""; } } 1;