Changeset 12947


Ignore:
Timestamp:
2006-09-29T15:38:44+12:00 (18 years ago)
Author:
kjdon
Message:

Added a new -extract_style option to HTMLPlug. It looks for style, script and link tags in the HTML head tag and saves them as ex.DocumentHeader metadata. -metadata_fields can now be used with -description_tags - why shouldn't we have metadata in the header as well as in the description tags? Head metadata can always be turned off using -no_metadata. -hunt_creator_metadata no longer needs the -metadata_fields option to be set.

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r12883 r12947  
    6363      { 'name' => "keep_head",
    6464    'desc' => "{HTMLPlug.keep_head}",
     65    'type' => "flag" },
     66      { 'name' => "extract_style",
     67    'desc' => "{HTMLPlug.extract_style}",
    6568    'type' => "flag" },
    6669      { 'name' => "no_metadata",
     
    171174    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    172175    my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    173 
    174     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
     176    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     177    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
    175178
    176179    # remove quotes from link at start and end if necessary
     
    242245
    243246    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
    244     unless $self->{'no_metadata'} || $self->{'description_tags'};
     247    unless $self->{'no_metadata'};
     248
     249    # extract style info as DocumentHeader metadata
     250    $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file)
     251    if ($self->{'extract_style'} == 1);
    245252
    246253    # Store URL for page as metadata - this can be used for an
     
    348355        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    349356
    350         # if document contains no Section tags we'll go ahead
    351         # and extract metadata (this won't have been done
    352         # above as the -description_tags option prevents it)
    353         my $complete_text = $head_keep.$doc_obj->get_text($cursection);
    354         $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
    355             unless $self->{'no_metadata'};
    356 
    357357        } else {
    358358        print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
     
    379379        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
    380380        }
    381 
    382         my $complete_text = $head_keep.$doc_obj->get_text($cursection);
    383         $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
    384         unless $self->{'no_metadata'};
    385     }
    386     }
     381    }
     382    } # if $self->{'description_tags'}
    387383    else {
    388384    # remove header and footer
     
    742738    my $outhandle = $self->{'outhandle'};
    743739    # if we don't want metadata, we may as well not be here ...
    744     return if (!defined $self->{'metadata_fields'});
     740    return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);
    745741
    746742    # metadata fields to extract/save. 'key' is the (lowercase) name of the
     
    762758    }
    763759
    764     if (defined $self->{'hunt_creator_metadata'} &&
    765     $self->{'hunt_creator_metadata'} == 1 ) {
     760    if ($self->{'hunt_creator_metadata'} == 1 ) {
    766761    my @extra_fields =
    767762        (
     
    918913
    919914
     915sub extract_style {
     916    my $self = shift (@_);
     917    my ($textref, $doc_obj, $section, $base_dir, $file) = @_;
     918    my $outhandle = $self->{'outhandle'};
     919   
     920    # find the header in the html file, which has the style info
     921    $$textref =~ m@<head>(.*?)</head>@si;
     922
     923    my $html_header=$1;
     924    my $style_contents = "";
     925   
     926    # look for style tags
     927    $html_header =~ /^/; # match the start of the string, for \G assertion
     928    while ($html_header =~ m/\G.*?<(style|script|link)/sig) {
     929    my $tag_name = $1;
     930    if ($tag_name eq "style") {
     931        if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) {
     932        $style_contents .= "\n<style";
     933        $style_contents .= $1;
     934        }
     935    }
     936    elsif ($tag_name eq "link") {
     937        $style_contents .= "\n<link";
     938        $html_header =~ m/\G(.*?>)/is;
     939        $style_contents .= $1;
     940    }
     941    elsif ($tag_name eq "script") {
     942        # bit more tricky cos it may or may not have content
     943        if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) {
     944        $style_contents .= "\n<script";
     945        $style_contents .= $1;
     946        } elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) {
     947        $style_contents .= "\n<script";
     948        $style_contents .= $1;
     949        }
     950    }
     951    }
     952
     953    # now we need to do something with any links found in the style thing
     954    $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
     955        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge;
     956
     957    $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents);
     958   
     959}
     960
    920961# evaluate any "../" to next directory up
    921962# evaluate any "./" as here
  • trunk/gsdl/perllib/strings.properties

    r12817 r12947  
    778778HTMLPlug.desc:This plugin processes HTML files
    779779
    780 HTMLPlug.description_tags:Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.
     780HTMLPlug.description_tags:Split document into sub-sections where <Section> tags occur. '-keep_head' will have no effect when this option is set.
     781
     782HTMLPlug.extract_style:Extract style and script information from the HTML <head> tag and save as DocumentHeader metadata. This will be set in the document page as the _document:documentheader_ macro.
    781783
    782784HTMLPlug.file_is_url:Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.
    783785
    784 HTMLPlug.hunt_creator_metadata:Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.
     786HTMLPlug.hunt_creator_metadata:Find as much metadata as possible on authorship and place it in the 'Creator' field.
    785787
    786788HTMLPlug.keep_head:Don't remove headers from html files.
Note: See TracChangeset for help on using the changeset viewer.