Changeset 7202


Ignore:
Timestamp:
2004-04-15T10:57:04+12:00 (20 years ago)
Author:
jrm21
Message:

rewrote the <meta> tag handling to be more robust and more efficient.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r6812 r7202  
    4747    @ISA = ('BasPlug');
    4848}
     49
     50use strict; # every perl program should have this!
     51no strict 'refs'; # make an exception so we can use variables as filehandles
    4952
    5053my $arguments =
     
    97100        'args'     => $arguments };
    98101
    99 
    100 #  sub print_usage {
    101 #      print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    102 #      print STDERR "  options:\n";
    103 #      print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
    104 #      print STDERR "                          flag may improve speed of building/importing but\n";
    105 #      print STDERR "                          any relative links within documents will be broken).\n";
    106 #      print STDERR "   -keep_head             Don't remove headers from html files.\n";
    107 #      print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
    108 #      print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
    109 #                            extract. Defaults to 'Title'.
    110 #                            Use 'tag<tagname>' to have the contents of the first
    111 #                            <tagname> pair put in a metadata element called
    112 #                            'tagname'. Capitalise this as you want the metadata
    113 #                            capitalised in Greenstone, since the tag extraction
    114 #                            is case insensitive.\n";
    115 #      print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
    116 #                            place it in the 'Creator' field. Requires the
    117 #                            -metadata_fields flag.\n";
    118 #      print STDERR "   -file_is_url           Set if input filenames make up url of original source
    119 #                            documents e.g. if a web mirroring tool was used to
    120 #                            create the import directory structure\n";
    121 #      print STDERR "   -assoc_files           Perl regular expression of file extensions to
    122 #                            associate with html documents.
    123 #                            Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
    124 #      print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
    125 #                            Also creates much shallower directory structure
    126 #                            (useful when creating collections to go on cd-rom).\n";
    127 #      print STDERR "   -title_sub             Substitution expression to modify string stored as
    128 #                            Title. Used by, for example, PDFPlug to remove
    129 #                            \"Page 1\", etc from text used as the title.\n";
    130 #      print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
    131 #                            occur. Note that by setting this option you
    132 #                            implicitly set -no_metadata, as all metadata should
    133 #                            be included within the <Section> tags (this is only
    134 #                            true for documents that actually contain <Section> tags
    135 #                            however). Also, '-keep_head' will have no effect when
    136 #                            this option is set, regardless of whether a document
    137 #                            contains Section tags.\n";
    138 #  }
    139 
    140102sub new {
    141103    my $class = shift (@_);
     
    600562}
    601563
     564
    602565sub extract_metadata {
    603566    my $self = shift (@_);
     
    607570    return if (!defined $self->{'metadata_fields'});
    608571
    609     # hunt for an author look in the metadata elements:
    610     if (defined $self->{'hunt_creator_metadata'}) {
    611     for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
    612         #if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
    613         if ($$textref =~ /<meta(\s*?[^<>]*?\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
    614         my $content = $1 . $2;
    615         if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
    616             if (defined $1) {
    617             my $value = $1;
    618             $value =~ s/\"$//;
    619             $value =~ s/\s+/ /gs;
    620             $doc_obj->add_utf8_metadata($section, "Creator", $value);
    621             print $outhandle " extracted Creator metadata \"$value\"\n"
    622                 if ($self->{'verbosity'} > 2);
    623             next;
    624             }
    625         }
     572    my %find_fields = (); # metadata fields to extract/save
     573
     574    my %creator_fields = (); # short-cut for lookups
     575
     576
     577    foreach my $field (split /,/, $self->{'metadata_fields'}) {
     578    $find_fields{lc($field)}=$field; # lc = lowercase
     579    }
     580
     581    if (defined $self->{'hunt_creator_metadata'} &&
     582    $self->{'hunt_creator_metadata'} == 1 ) {
     583    my @extra_fields =
     584        (
     585         'author',
     586         'author.email',
     587         'creator',
     588         'dc.creator',
     589         'dc.creator.corporatename',
     590         );
     591
     592    # add the creator_metadata fields to search for
     593    foreach my $field (@extra_fields) {
     594        $creator_fields{$field}=0; # add to lookup hash
     595    }
     596    }
     597
     598    # find the header in the html file, which has the meta tags
     599    $$textref =~ m@<head>(.*?)</head>@si;
     600
     601    my $html_header=$1;
     602
     603    # go through every <meta... tag defined in the html and see if it is
     604    # one of the tags we want to match.
     605
     606    # this assumes that ">" won't appear. (I don't think it's allowed to...)
     607    $html_header =~ /^/; # match the start of the string, for \G assertion
     608    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
     609    my $metatag=$1;
     610    my ($tag, $value);
     611
     612    # find the tag name
     613    $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
     614    $tag=$2;
     615    # in case they're not using " or ', but they should...
     616    if (! $tag) {
     617        $metatag =~ /(?:name|http-equiv)\s*=\s*(.*?)(?!\w)/is;
     618        $tag=$1;
     619    }
     620
     621    if (!defined $tag) {
     622        print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
     623        next;
     624    }
     625
     626    # don't need to assign this field if it was passed in from a previous
     627    # (recursive) plugin
     628    if (defined $metadata->{$tag}) {next}
     629
     630    # find the tag content
     631    $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
     632    $value=$2;
     633    if (! $value) {
     634        $metatag =~ /(?:name|http-equiv)\s*=\s*(.*?)(?!\w)/is;
     635        $value=$1;
     636    }
     637    if (!defined $value) {
     638        print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
     639        next;
     640    }
     641
     642    # clean up and add
     643    $value =~ s/\s+/ /gs;
     644    if (exists $creator_fields{lc($tag)}) {
     645        # map this value onto greenstone's "Creator" metadata
     646        $tag='Creator';
     647    } elsif (!exists $find_fields{lc($tag)}) {
     648        next; # don't want this tag
     649    } else {
     650        # get the user's preferred capitalisation
     651        $tag = $find_fields{lc($tag)};
     652    }
     653    print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
     654        if ($self->{'verbosity'} > 2);
     655    $doc_obj->add_utf8_metadata($section, $tag, $value);
     656
     657    }
     658   
     659    # TITLE: extract the document title
     660    if (exists $find_fields{'title'} && $find_fields{'title'} == 0) {
     661    # we want a title, and didn't find one in the meta tags
     662    # see if there's a <title> tag
     663    my $title;
     664    if ($html_header =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
     665        $title = $1;
     666    }
     667    if (!defined $title) {
     668        # if no title use first 100 or so characters
     669        $title = $$textref;
     670        $title =~ s/^.*?<body>//si;
     671        # ignore javascript!
     672        $title =~ s@<script.*?</script>@ @sig;
     673        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
     674        $title =~ s/<[^>]*>/ /g; # remove all HTML tags
     675        $title = substr ($title, 0, 100);
     676        $title =~ s/\s\S*$/.../;
     677    }
     678    $title =~ s/<[^>]*>/ /g; # remove html tags
     679    $title =~ s/&nbsp;/ /g;
     680    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
     681    $title =~ s/\s+/ /gs; # collapse multiple spaces
     682    $title =~ s/^\s*//;   # remove leading spaces
     683    $title =~ s/\s*$//;   # remove trailing spaces
     684    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
     685    $title =~ s/^\s+//s; # in case title_sub introduced any...
     686    $doc_obj->add_utf8_metadata ($section, 'Title', $title);
     687    print $outhandle " extracted Title metadata \"$title\"\n"
     688        if ($self->{'verbosity'} > 2);
     689    }
     690
     691    # Special, for metadata names such as tagH1 - extracts
     692    # the text between the first <H1> and </H1> tags into "H1" metadata.
     693
     694    foreach my $field (keys %find_fields) {
     695    if ($field !~ /^tag([a-z0-9]+)$/i) {next}
     696    my $tag = $1;
     697    if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
     698        my $content = $1;
     699        $content =~ s/&nbsp;/ /g;
     700        $content =~ s/<[^>]*>/ /g;
     701        $content =~ s/^\s+//;
     702        $content =~ s/\s+$//;
     703        $content =~ s/\s+/ /gs;
     704        if ($content) {
     705        $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
     706        $tag =~ s/^tag//i;
     707        $doc_obj->add_utf8_metadata ($section, $tag, $content);
     708        print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
     709            if ($self->{'verbosity'} > 2);
    626710        }
    627711    }
    628     }
    629    
    630     foreach my $field (split /,/, $self->{'metadata_fields'}) {
    631     my $found = 0;
    632     # don't need to extract field if it was passed in from a previous
    633     # (recursive) plugin
    634     next if defined $metadata->{$field};
    635 
    636     # see if there's a <meta> tag for this field
    637     #while ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) {
    638     while ($$textref =~ /<meta(\s*?[^<>]*?\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) {
    639         my $content = $1 . $2;
    640         if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
    641         if (defined $1) {
    642             my $value = $1;
    643             $value =~ s/\"$//;
    644             $value =~ s/\s+/ /gs;
    645             $value =~ s/\".*//gs;
    646             $doc_obj->add_utf8_metadata($section, $field, $value);
    647             print $outhandle " extracted \"$field\" metadata \"$value\"\n"
    648             if ($self->{'verbosity'} > 2);
    649             $found = 1;
    650         }
    651         }
    652     }
    653     next if $found;
    654     # TITLE: extract the document title
    655    
    656     if ($field =~ /^title$/i) {
    657 
    658         # see if there's a <title> tag
    659         if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
    660         if (defined $1) {
    661             my $title = $1;
    662             # Arg. This allows only ascii value characters in titles
    663             if ($title =~ /\w/) {
    664             $title =~ s/<[^>]*>/ /g;
    665             $title =~ s/&nbsp;/ /g;
    666             $title =~ s/\s+/ /gs;
    667             $title =~ s/^\s+//;
    668             $title =~ s/\s+$//;
    669             $doc_obj->add_utf8_metadata ($section, $field, $title);
    670             print $outhandle " extracted \"$field\" metadata \"$title\"\n"
    671                 if ($self->{'verbosity'} > 2);
    672             next;
    673             }
    674         }
    675         }
    676        
    677         # if no title use first 100 characters
    678         my $tmptext = $$textref;
    679         $tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
    680         $tmptext =~ s/<[^>]*>/ /g;
    681         $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
    682         $tmptext =~ s/^\s+//s;
    683         $tmptext =~ s/\s+$//;
    684         $tmptext =~ s/\s+/ /gs;
    685         $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
    686         $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
    687         $tmptext = substr ($tmptext, 0, 100);
    688         $tmptext =~ s/\s\S*$/.../;
    689         $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
    690         print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
    691         if ($self->{'verbosity'} > 2);
    692         next;
    693     }
    694 
    695         # tag: extract the text between the first <H1> and </H1> tags
    696         if ($field =~ /^tag[a-z0-9]+$/i) {
    697        
    698         my $tag = $field;
    699         $tag =~ s/^tag//i;
    700             my $tmptext = $$textref;
    701             $tmptext =~ s/\s+/ /gs;
    702             if ($tmptext =~ /<$tag[^>]*>/i) {       
    703         foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
    704             $word =~ s/&nbsp;/ /g;
    705             $word =~ s/<[^>]*>/ /g;
    706             $word =~ s/^\s+//;
    707             $word =~ s/\s+$//;
    708             $word =~ s/\s+/ /gs;
    709             if ($word ne "") {
    710             $doc_obj->add_utf8_metadata ($section, $tag, $word);
    711             print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
    712                 if ($self->{'verbosity'} > 2);
    713             }
    714         }
    715             }
    716             next;
    717         }   
    718     }
     712    }   
    719713}
    720714
Note: See TracChangeset for help on using the changeset viewer.