###########################################################################
#
# HTMLPlug.pm -- basic html plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# Note that this plugin handles frames only in a very simple way
# i.e. each frame is treated as a separate document. This means
# search results will contain links to individual frames rather
# than linking to the top level frameset.
# There may also be some problems caused by the _parent target
# (it's removed by this plugin)
# To use frames properly you'll need to use the WebPlug plugin.
#

package HTMLPlug;

use BasPlug;
use ghtml;
use unicode;
use util;
use parsargv;
use File::Basename;    # needed for File::Basename::dirname() in format_link()

sub BEGIN {
    @ISA = ('BasPlug');
}

# Option declarations consumed by the Greenstone configuration/usage system.
# Each entry describes one command-line option of this plugin.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => &get_default_process_exp() },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => 'string',
        'deft' => &get_default_block_exp() },
      { 'name' => "nolinks",
        'desc' => "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
        'type' => "flag" },
      { 'name' => "keep_head",
        'desc' => "Don't remove headers from html files.",
        'type' => "flag" },
      { 'name' => "no_metadata",
        'desc' => "Don't attempt to extract any metadata from files.",
        'type' => "flag" },
      { 'name' => "metadata_fields",
        'desc' => "Comma separated list of metadata fields to attempt to extract. Use 'tag<tagname>' to have the contents of the first <tagname> pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
        'type' => "metadata",
        'deft' => "Title" },
      { 'name' => "hunt_creator_metadata",
        'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
        'type' => "flag" },
      { 'name' => "file_is_url",
        'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
        'type' => "flag" },
      { 'name' => "assoc_files",
        'desc' => "Perl regular expression of file extensions to associate with html documents.",
        'type' => "string",
        'deft' => q^(?i)\.(jpe?g|gif|png|css)$^ },
      { 'name' => "rename_assoc_files",
        'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
        'type' => "flag" },
      { 'name' => "title_sub",
        'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
        'type' => "string",
        'deft' => "" },
      { 'name' => "description_tags",
        'desc' => "Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.",
        'type' => "flag" } ];

my $options = { 'name'     => "HTMLPlug",
                'desc'     => "This plugin processes HTML files",
                'inherits' => "yes",
                'args'     => $arguments };

# The hard-coded usage message below has been superseded by the $options
# structure above; it is retained (commented out) for reference only.
#
# sub print_usage {
#     print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
#     print STDERR "  options:\n";
#     print STDERR "   -nolinks           Don't make any attempt to trap links (setting this
#                        flag may improve speed of building/importing but
#                        any relative links within documents will be broken).\n";
#     print STDERR "   -keep_head         Don't remove headers from html files.\n";
#     print STDERR "   -no_metadata       Don't attempt to extract any metadata from files.\n";
#     print STDERR "   -metadata_fields   Comma separated list of metadata fields to attempt to
#                        extract. Defaults to 'Title'.
#                        Use 'tag<tagname>' to have the contents of the first
#                        <tagname> pair put in a metadata element called
#                        'tagname'. Capitalise this as you want the metadata
#                        capitalised in Greenstone, since the tag extraction
#                        is case insensitive.\n";
#     print STDERR "   -hunt_creator_metadata  Find as much metadata as possible on authorship and
#                        place it in the 'Creator' field. Requires the
#                        -metadata_fields flag.\n";
#     print STDERR "   -file_is_url       Set if input filenames make up url of original source
#                        documents e.g. if a web mirroring tool was used to
#                        create the import directory structure\n";
#     print STDERR "   -assoc_files       Perl regular expression of file extensions to
#                        associate with html documents.
#                        Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
#     print STDERR "   -rename_assoc_files  Renames files associated with documents (e.g. images).
#                        Also creates much shallower directory structure
#                        (useful when creating collections to go on cd-rom).\n";
#     print STDERR "   -title_sub         Substitution expression to modify string stored as
#                        Title. Used by, for example, PDFPlug to remove
#                        \"Page 1\", etc from text used as the title.\n";
#     print STDERR "   -description_tags  Split document into sub-sections where <Section> tags
#                        occur. Note that by setting this option you
#                        implicitly set -no_metadata, as all metadata should
#                        be included within the <Section> tags (this is only
#                        true for documents that actually contain <Section> tags
#                        however). Also, '-keep_head' will have no effect when
#                        this option is set, regardless of whether a document
#                        contains Section tags.\n";
# }

# Constructor. Builds on BasPlug, registers this plugin's option list and
# parses the plugin-specific options out of @_.
sub new {
    my $class = shift (@_);
    my $self = BasPlug->new ($class, @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
                         q^nolinks^, \$self->{'nolinks'},
                         q^keep_head^, \$self->{'keep_head'},
                         q^no_metadata^, \$self->{'no_metadata'},
                         q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
                         q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
                         q^w3mir^, \$self->{'w3mir'},
                         q^file_is_url^, \$self->{'file_is_url'},
                         q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
                         q^rename_assoc_files^, \$self->{'rename_assoc_files'},
                         q^title_sub/.*/^, \$self->{'title_sub'},
                         q^description_tags^, \$self->{'description_tags'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage();
        die "\n";
    }

    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url)
    if ($self->{'w3mir'}) {
        $self->{'file_is_url'} = 1;
    }

    # per-document state for associated-file renaming (see add_file)
    $self->{'aux_files'} = {};
    $self->{'dir_num'}   = 0;
    $self->{'file_num'}  = 0;

    return bless $self, $class;
}

# Default regexp of file extensions this plugin blocks from later plugins
# (images and stylesheets are associated with documents, not documents).
sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)\.(gif|jpe?g|jpe|png|css)$^;
}

# Default regexp of filenames this plugin will process.
sub get_default_process_exp {
    my $self = shift (@_);

    # the last option is an attempt to encode the concept of an html query ...
    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
}

# do plugin specific processing of doc_obj
#
# Either processes the whole file as a single section, or (with
# -description_tags) splits it into sub-sections wherever <Section> tags
# appear inside HTML comments.  Returns 1 on success.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    if ($ENV{'GSDLOS'} =~ /^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'}   = 0;
    $self->{'file_num'}  = 0;

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).
    my $web_url = "http://$file";
    $doc_obj->add_metadata($cursection, "URL", $web_url);

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        # Section markup lives inside HTML comments; accept both literal
        # and entity-encoded comment delimiters (some editors re-encode them).
        my $opencom  = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
        my $lt   = '(?:<|&lt;)';
        my $gt   = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        my $found_something = 0;
        my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    # the very first <Section> maps onto the top section
                    # rather than creating a child of it
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection))
                        unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
                    my $metavalue = $1;
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg;
                    $metavalue =~ s/\s+/ /sg;
                    $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }

        # an empty string is the top section, so anything else means a
        # <Section> was opened but never closed
        if ($cursection ne "") {
            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ /\S/) {
            if (!$found_something) {
                print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
                print $outhandle " will be processed as a single section document\n";

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                $self->extract_metadata (\$doc_obj->get_text($cursection),
                                         $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};
            } else {
                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";
                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {
            # may get to here if document contained no valid Section
            # tags but did contain some comments. The text will have
            # been processed already but we should print the warning
            # as above and extract metadata
            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
            print $outhandle " will be processed as a single section document\n";

            $self->extract_metadata (\$doc_obj->get_text($cursection),
                                     $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }
    } else {
        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
#
# Rewrites links and image references within the section text, then adds
# the text to doc_obj under $cursection.
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;

    # trap links
    if (!$self->{'nolinks'}) {
        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    # allow spaces if inside quotes - jrm21
    $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;

    $doc_obj->add_utf8_text($cursection, $$textref);
}

# Regex callback for <img src=...>: associates the image file with the
# document and rewrites the src to the _httpdocimg_ form.
sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    if ($link =~ /^\"/) {
        $link  =~ s/^\"//;
        $link  =~ s/\"$//;
        $front .= '"';
        $back   = "\"$back";
    }

    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
}

# Regex callback for <a>/<area>/<frame>/<link> href or src attributes:
# rewrites the link to an external-link or associated-file form.
sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back  =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back  =~ s/target=\"?_parent\"?//is;

    # leave pure fragment links (within this document) untouched
    return $front . $link . $back if $link =~ /^\#/s;

    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back
        if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    # NOTE(review): these _httpextlink_ URLs use a raw '&' separator; confirm
    # against the Greenstone runtime macros whether '&amp;' was intended.
    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
    } else {
        # link is to some other type of file (eg image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}

# Associates $href's file with the document (renaming it if
# -rename_assoc_files is set) and returns the rewritten link text.
# Files whose extension doesn't match -assoc_files become external links.
sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    $filename =~ s/^[^:]*:\/\///;
    $filename = &util::filename_cat($base_dir, $filename);

    # Replace %20's in URL with a space if required. Note that the filename
    # may include the %20 in some situations
    if ($filename =~ /\%20/) {
        if (!-e $filename) {
            $filename =~ s/\%20/ /g;
        }
    }

    my ($ext) = $filename =~ /(\.[^\.]*)$/;
    if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
        # not an extension we associate - treat as an external link
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            # already associated under a generated name - reuse it
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'},
                                             'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    } else {
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}

# Splits a link into (href, hash_part, rl) where rl is 1 when the target
# exists locally under $base_dir ("relative link") and 0 otherwise.
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {

        if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root
            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            } else {
                # see if link shares directory with source document
                # => turn into relative link if this is so!
                if ($ENV{'GSDLOS'} =~ /^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash = $before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^$base_dir/@@o) {
                        # if this is true, we removed a prefix
                        $before_hash = $win_before_hash;
                    }
                } else {
                    $before_hash = &util::filename_cat("", $before_hash);
                    $before_hash =~ s@^$base_dir/@@;
                }
            }
        } else {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ("http://" . $before_hash, $hash_part, 1);
    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}

# For each size N in the comma-separated -first option, extracts the first
# N characters of the document body (tags, scripts and entities removed)
# and stores it as FirstN metadata.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = $$textref;
        # skip to the body
        $tmptext =~ s/.*<body[^>]*>//i;
        # remove javascript
        $tmptext =~ s@<script.*?</script>@ @sig;
        $tmptext =~ s/<[^>]*>/ /g;
        $tmptext =~ s/&nbsp;/ /g;
        $tmptext =~ s/^\s+//;
        $tmptext =~ s/\s+$//;
        $tmptext =~ s/\s+/ /gs;
        $tmptext = substr ($tmptext, 0, $size);
        $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}

# Extracts metadata from <meta> tags, the <title> tag and arbitrary
# 'tag<tagname>' fields named by -metadata_fields, adding each value to
# doc_obj under $section.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};

    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # hunt for an author look in the metadata elements:
    if (defined $self->{'hunt_creator_metadata'}) {
        for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
            if ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
                my $content = $1 . $2;
                if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
                    if (defined $1) {
                        my $value = $1;
                        $value =~ s/\"$//;
                        $value =~ s/\s+/ /gs;
                        $doc_obj->add_utf8_metadata($section, "Creator", $value);
                        print $outhandle " extracted Creator metadata \"$value\"\n"
                            if ($self->{'verbosity'} > 2);
                        next;
                    }
                }
            }
        }
    }

    foreach my $field (split /,/, $self->{'metadata_fields'}) {

        # don't need to extract field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there's a <meta> tag for this field
        if ($$textref =~ /<meta(.*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
            my $content = $1 . $2;
            if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
                if (defined $1) {
                    my $value = $1;
                    $value =~ s/\"$//;
                    $value =~ s/\s+/ /gs;
                    $value =~ s/\".*//gs;
                    $doc_obj->add_utf8_metadata($section, $field, $value);
                    print $outhandle " extracted \"$field\" metadata \"$value\"\n"
                        if ($self->{'verbosity'} > 2);
                    next;
                }
            }
        }

        # TITLE: extract the document title
        if ($field =~ /^title$/i) {

            # see if there's a <title> tag
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                if (defined $1) {
                    my $title = $1;
                    if ($title =~ /\w/) {
                        $title =~ s/<[^>]*>/ /g;
                        $title =~ s/&nbsp;/ /g;
                        $title =~ s/\s+/ /gs;
                        $title =~ s/^\s+//;
                        $title =~ s/\s+$//;
                        $doc_obj->add_utf8_metadata ($section, $field, $title);
                        print $outhandle " extracted \"$field\" metadata \"$title\"\n"
                            if ($self->{'verbosity'} > 2);
                        next;
                    }
                }
            }

            # if no title use first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $tmptext =~ s/<[^>]*>/ /g;
            $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
            $tmptext =~ s/^\s+//s;
            $tmptext =~ s/\s+$//;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
            $tmptext = substr ($tmptext, 0, 100);
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # tag: extract the text between the first <H1> and </H1> tags
        if ($field =~ /^tag[a-z0-9]+$/i) {
            my $tag = $field;
            $tag =~ s/^tag//i;
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            if ($tmptext =~ /<$tag[^>]*>/i) {
                foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
                    $word =~ s/&nbsp;/ /g;
                    $word =~ s/<[^>]*>/ /g;
                    $word =~ s/^\s+//;
                    $word =~ s/\s+$//;
                    $word =~ s/\s+/ /gs;
                    if ($word ne "") {
                        $doc_obj->add_utf8_metadata ($section, $tag, $word);
                        print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
                            if ($self->{'verbosity'} > 2);
                    }
                }
            }
            next;
        }
    }
}

# evaluate any "../" to next directory up
# evaluate any "./" as here
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();
    my @dirsep = split(/$dirsep_os/, $filename);

    my @eval_dirs = ();
    foreach my $d (@dirsep) {
        if ($d eq "..") {
            pop(@eval_dirs);
        } elsif ($d eq ".") {
            # do nothing!
        } else {
            push(@eval_dirs, $d);
        }
    }

    return &util::filename_cat(@eval_dirs);
}

# Regex callback: strips a leading "./" from usemap attribute values
# (usemap="./#index" is not handled correctly by browsers).
sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    $link =~ s/^\.\///;
    return $front . $link . $back;
}

# Advances the (dir_num, file_num) counter pair used to generate names for
# renamed associated files; rolls over to a new directory every 1000 files.
sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} == 1000) {
        $self->{'dir_num'} ++;
        $self->{'file_num'} = 0;
    } else {
        $self->{'file_num'} ++;
    }
}

# Extend the BasPlug read_file so that entities like &eacute; are
# converted to UTF8 internally.
#
# We don't convert &lt; or &gt; or &amp; or &quot; in case
# they interfere with the GML files
sub read_file {
    my ($self, $filename, $encoding, $language, $textref) = @_;

    &BasPlug::read_file($self, $filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents.  The protected entities
    # are temporarily renamed (&lt; -> &zlt;) so the general conversion
    # pass leaves them alone, then renamed back.
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}

1;