Context Navigation

← Previous Changeset
Next Changeset →

Changeset 7202

Timestamp:

2004-04-15T10:57:04+12:00 (20 years ago)

Author:

jrm21

Message:

rewrote the <meta> tag handling to be more robust and more efficient.

File:

: 1 edited

trunk/gsdl/perllib/plugins/HTMLPlug.pm (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r6812
+              r7202
     @ISA = ('BasPlug');
+}
+use strict; # every perl program should have this!
+no strict 'refs'; # make an exception so we can use variables as filehandles
 my $arguments =
 …
         'args'     => $arguments };
-#  sub print_usage {
-#      print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
-#      print STDERR "  options:\n";
-#      print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
-#      print STDERR "                          flag may improve speed of building/importing but\n";
-#      print STDERR "                          any relative links within documents will be broken).\n";
-#      print STDERR "   -keep_head             Don't remove headers from html files.\n";
-#      print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
-#      print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
-#                            extract. Defaults to 'Title'.
-#                            Use 'tag<tagname>' to have the contents of the first
-#                            <tagname> pair put in a metadata element called
-#                            'tagname'. Capitalise this as you want the metadata
-#                            capitalised in Greenstone, since the tag extraction
-#                            is case insensitive.\n";
-#      print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
-#                            place it in the 'Creator' field. Requires the
-#                            -metadata_fields flag.\n";
-#      print STDERR "   -file_is_url           Set if input filenames make up url of original source
-#                            documents e.g. if a web mirroring tool was used to
-#                            create the import directory structure\n";
-#      print STDERR "   -assoc_files           Perl regular expression of file extensions to
-#                            associate with html documents.
-#                            Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
-#      print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
-#                            Also creates much shallower directory structure
-#                            (useful when creating collections to go on cd-rom).\n";
-#      print STDERR "   -title_sub             Substitution expression to modify string stored as
-#                            Title. Used by, for example, PDFPlug to remove
-#                            \"Page 1\", etc from text used as the title.\n";
-#      print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
-#                            occur. Note that by setting this option you
-#                            implicitly set -no_metadata, as all metadata should
-#                            be included within the <Section> tags (this is only
-#                            true for documents that actually contain <Section> tags
-#                            however). Also, '-keep_head' will have no effect when
-#                            this option is set, regardless of whether a document
-#                            contains Section tags.\n";
-#  }
 sub new {
     my $class = shift (@_);
 …
+}
 sub extract_metadata {
     my $self = shift (@_);
 …
     return if (!defined $self->{'metadata_fields'});
+    # hunt for an author: look in the metadata elements
+    if (defined $self->{'hunt_creator_metadata'}) {
+    for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
+        #if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
+        if ($$textref =~ /<meta(\s*?[^<>]*?\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
+        my $content = $1 . $2;
+        if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
+            if (defined $1) {
+            my $value = $1;
+            $value =~ s/\"$//;
+            $value =~ s/\s+/ /gs;
+            $doc_obj->add_utf8_metadata($section, "Creator", $value);
+            print $outhandle " extracted Creator metadata \"$value\"\n"
+                if ($self->{'verbosity'} > 2);
+            next;
+            }
+        }
+    my %find_fields = (); # metadata fields to extract/save
+    my %creator_fields = (); # short-cut for lookups
+    foreach my $field (split /,/, $self->{'metadata_fields'}) {
+    $find_fields{lc($field)}=$field; # lc = lowercase
+    }
+    if (defined $self->{'hunt_creator_metadata'} &&
+    $self->{'hunt_creator_metadata'} == 1 ) {
+    my @extra_fields =
+        (
+         'author',
+         'author.email',
+         'creator',
+         'dc.creator',
+         'dc.creator.corporatename',
+         );
+    # add the creator_metadata fields to search for
+    foreach my $field (@extra_fields) {
+        $creator_fields{$field}=0; # add to lookup hash
+    }
+    }
+    # find the header in the html file, which has the meta tags
+    $$textref =~ m@<head>(.*?)</head>@si;
+    my $html_header=$1;
+    # go through every <meta... tag defined in the html and see if it is
+    # one of the tags we want to match.
+    # this assumes that ">" won't appear. (I don't think it's allowed to...)
+    $html_header =~ /^/; # match the start of the string, for \G assertion
+    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
+    my $metatag=$1;
+    my ($tag, $value);
+    # find the tag name
+    $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
+    $tag=$2;
+    # in case they're not using " or ', but they should...
+    if (! $tag) {
+        $metatag =~ /(?:name|http-equiv)\s*=\s*(.*?)(?!\w)/is;
+        $tag=$1;
+    }
+    if (!defined $tag) {
+        print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
+        next;
+    }
+    # don't need to assign this field if it was passed in from a previous
+    # (recursive) plugin
+    if (defined $metadata->{$tag}) {next}
+    # find the tag content
+    $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
+    $value=$2;
+    if (! $value) {
+        $metatag =~ /(?:name|http-equiv)\s*=\s*(.*?)(?!\w)/is;
+        $value=$1;
+    }
+    if (!defined $value) {
+        print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
+        next;
+    }
+    # clean up and add
+    $value =~ s/\s+/ /gs;
+    if (exists $creator_fields{lc($tag)}) {
+        # map this value onto greenstone's "Creator" metadata
+        $tag='Creator';
+    } elsif (!exists $find_fields{lc($tag)}) {
+        next; # don't want this tag
+    } else {
+        # get the user's preferred capitalisation
+        $tag = $find_fields{lc($tag)};
+    }
+    print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
+        if ($self->{'verbosity'} > 2);
+    $doc_obj->add_utf8_metadata($section, $tag, $value);
+    }
+    # TITLE: extract the document title
+    if (exists $find_fields{'title'} && $find_fields{'title'} == 0) {
+    # we want a title, and didn't find one in the meta tags
+    # see if there's a <title> tag
+    my $title;
+    if ($html_header =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
+        $title = $1;
+    }
+    if (!defined $title) {
+        # if no title use first 100 or so characters
+        $title = $$textref;
+        $title =~ s/^.*?<body>//si;
+        # ignore javascript!
+        $title =~ s@<script.*?</script>@ @sig;
+        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
+        $title =~ s/<[^>]*>/ /g; # remove all HTML tags
+        $title = substr ($title, 0, 100);
+        $title =~ s/\s\S*$/.../;
+    }
+    $title =~ s/<[^>]*>/ /g; # remove html tags
+    $title =~ s/&nbsp;/ /g;
+    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
+    $title =~ s/\s+/ /gs; # collapse multiple spaces
+    $title =~ s/^\s*//;   # remove leading spaces
+    $title =~ s/\s*$//;   # remove trailing spaces
+    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
+    $title =~ s/^\s+//s; # in case title_sub introduced any...
+    $doc_obj->add_utf8_metadata ($section, 'Title', $title);
+    print $outhandle " extracted Title metadata \"$title\"\n"
+        if ($self->{'verbosity'} > 2);
+    }
+    # Special, for metadata names such as tagH1 - extracts
+    # the text between the first <H1> and </H1> tags into "H1" metadata.
+    foreach my $field (keys %find_fields) {
+    if ($field !~ /^tag([a-z0-9]+)$/i) {next}
+    my $tag = $1;
+    if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
+        my $content = $1;
+        $content =~ s/&nbsp;/ /g;
+        $content =~ s/<[^>]*>/ /g;
+        $content =~ s/^\s+//;
+        $content =~ s/\s+$//;
+        $content =~ s/\s+/ /gs;
+        if ($content) {
+        $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
+        $tag =~ s/^tag//i;
+        $doc_obj->add_utf8_metadata ($section, $tag, $content);
+        print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
+            if ($self->{'verbosity'} > 2);
+        }
+    }
+    }
+    foreach my $field (split /,/, $self->{'metadata_fields'}) {
+    my $found = 0;
+    # don't need to extract field if it was passed in from a previous
+    # (recursive) plugin
+    next if defined $metadata->{$field};
+    # see if there's a <meta> tag for this field
+    #while ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) {
+    while ($$textref =~ /<meta(\s*?[^<>]*?\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) {
+        my $content = $1 . $2;
+        if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
+        if (defined $1) {
+            my $value = $1;
+            $value =~ s/\"$//;
+            $value =~ s/\s+/ /gs;
+            $value =~ s/\".*//gs;
+            $doc_obj->add_utf8_metadata($section, $field, $value);
+            print $outhandle " extracted \"$field\" metadata \"$value\"\n"
+            if ($self->{'verbosity'} > 2);
+            $found = 1;
+        }
+        }
+    }
+    next if $found;
+    # TITLE: extract the document title
+    if ($field =~ /^title$/i) {
+        # see if there's a <title> tag
+        if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
+        if (defined $1) {
+            my $title = $1;
+            # Arg. This allows only ascii value characters in titles
+            if ($title =~ /\w/) {
+            $title =~ s/<[^>]*>/ /g;
+            $title =~ s/&nbsp;/ /g;
+            $title =~ s/\s+/ /gs;
+            $title =~ s/^\s+//;
+            $title =~ s/\s+$//;
+            $doc_obj->add_utf8_metadata ($section, $field, $title);
+            print $outhandle " extracted \"$field\" metadata \"$title\"\n"
+                if ($self->{'verbosity'} > 2);
+            next;
+            }
+        }
+        }
+        # if no title use first 100 characters
+        my $tmptext = $$textref;
+        $tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
+        $tmptext =~ s/<[^>]*>/ /g;
+        $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
+        $tmptext =~ s/^\s+//s;
+        $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s+/ /gs;
+        $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
+        $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
+        $tmptext = substr ($tmptext, 0, 100);
+        $tmptext =~ s/\s\S*$/.../;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
+        if ($self->{'verbosity'} > 2);
+        next;
+    }
+        # tag: extract the text between the first <H1> and </H1> tags
+        if ($field =~ /^tag[a-z0-9]+$/i) {
+        my $tag = $field;
+        $tag =~ s/^tag//i;
+            my $tmptext = $$textref;
+            $tmptext =~ s/\s+/ /gs;
+            if ($tmptext =~ /<$tag[^>]*>/i) {
+        foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
+            $word =~ s/&nbsp;/ /g;
+            $word =~ s/<[^>]*>/ /g;
+            $word =~ s/^\s+//;
+            $word =~ s/\s+$//;
+            $word =~ s/\s+/ /gs;
+            if ($word ne "") {
+            $doc_obj->add_utf8_metadata ($section, $tag, $word);
+            print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
+                if ($self->{'verbosity'} > 2);
+            }
+        }
+            }
+            next;
+        }
+    }
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 7202

Legend:

trunk/gsdl/perllib/plugins/HTMLPlug.pm

Download in other formats: