Context Navigation

← Previous Changeset
Next Changeset →

Changeset 12947

Timestamp:

2006-09-29T15:38:44+12:00 (18 years ago)

Author:

kjdon

Message:

Added new -extract_style option to HTMLPlug. It looks for style, script and link tags in the HTML head tag, and saves them as ex.DocumentHeader metadata. -metadata_fields can now be used with -description_tags - why shouldn't we have metadata in the header as well as in the description tags? Head metadata can always be turned off using -no_metadata. -hunt_creator_metadata no longer needs the -metadata_fields option to be set.

Location:

trunk/gsdl/perllib

Files:

: 2 edited

plugins/HTMLPlug.pm (modified) (8 diffs)
strings.properties (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r12883
+              r12947
       { 'name' => "keep_head",
     'desc' => "{HTMLPlug.keep_head}",
+    'type' => "flag" },
+      { 'name' => "extract_style",
+    'desc' => "{HTMLPlug.extract_style}",
     'type' => "flag" },
       { 'name' => "no_metadata",
 …
     my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
+    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
+    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
     # remove quotes from link at start and end if necessary
 …
     $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
+    unless $self->{'no_metadata'} || $self->{'description_tags'};
+    unless $self->{'no_metadata'};
+    # extract style info as DocumentHeader metadata
+    $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file)
+    if ($self->{'extract_style'} == 1);
     # Store URL for page as metadata - this can be used for an
 …
         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
-        # if document contains no Section tags we'll go ahead
-        # and extract metadata (this won't have been done
-        # above as the -description_tags option prevents it)
-        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
-        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
-            unless $self->{'no_metadata'};
         } else {
         print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
 …
         print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
+        }
+        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
+        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
+        unless $self->{'no_metadata'};
+    }
+    }
+    }
+    } # if $self->{'description_tags'}
     else {
     # remove header and footer
 …
     my $outhandle = $self->{'outhandle'};
     # if we don't want metadata, we may as well not be here ...
     return if (!defined $self->{'metadata_fields'});
+    return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);
     # metadata fields to extract/save. 'key' is the (lowercase) name of the
 …
+    }
+    if (defined $self->{'hunt_creator_metadata'} &&
+    $self->{'hunt_creator_metadata'} == 1 ) {
+    if ($self->{'hunt_creator_metadata'} == 1 ) {
     my @extra_fields =
+        (
 …
# Collect the <style>, <script> and <link> elements found in the document's
# <head> and store them on the document as "DocumentHeader" metadata.
#
# Arguments:
#   $textref  - reference to the raw HTML text of the document
#   $doc_obj  - document object; receives the result via add_utf8_metadata()
#   $section  - section of $doc_obj the metadata is attached to
#   $base_dir - handed on unchanged to replace_href_links()
#   $file     - handed on unchanged to replace_href_links()
#
# "DocumentHeader" is always set; it is the empty string when the document
# has no <head> or the head contains none of the three tags.
sub extract_style {
    my $self = shift (@_);
    my ($textref, $doc_obj, $section, $base_dir, $file) = @_;

    # Find the header in the html file, which has the style info.  The
    # capture is only used when the match succeeded, so a document without
    # a <head> gives an empty header rather than a stale or undefined $1.
    # Attributes on the <head> tag itself are allowed.
    my $html_header = "";
    if ($$textref =~ m@<head(?:\s[^>]*)?>(.*?)</head>@si) {
        $html_header = $1;
    }

    my $style_contents = "";

    # Walk through the header one interesting tag at a time.  The inner
    # matches use \G with /gc so that they continue from where the previous
    # match stopped and leave the position untouched when they fail.
    while ($html_header =~ m/<(style|script|link)\b/gi) {
        # the search is case-insensitive, so normalise before comparing
        my $tag_name = lc($1);

        # rest of the opening tag, up to and including its '>'
        next unless ($html_header =~ m/\G([^>]*>)/gc);
        my $element = "<" . $tag_name . $1;

        if ($tag_name ne "link") {
            # style and script carry a body and a closing tag.  The body may
            # be empty or contain '<' (e.g. when wrapped in <!-- ... -->).
            if ($html_header =~ m/\G(.*?<\/$tag_name[^>]*>)/gcis) {
                $element .= $1;
            }
            elsif ($tag_name ne "script" || $element !~ m/\bsrc\s*=/i) {
                # unterminated element with nothing usable in it - skip it.
                # (an unterminated script that has a src attribute is still
                # kept, as just its opening tag)
                next;
            }
        }

        $style_contents .= "\n" . $element;
    }

    # now we need to do something with any links found in the style thing
    $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge;

    $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents);
}
 # evaluate any "../" to next directory up
 # evaluate any "./" as here

trunk/gsdl/perllib/strings.properties

-              r12817
+              r12947
 HTMLPlug.desc:This plugin processes HTML files
+HTMLPlug.description_tags:Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.
+HTMLPlug.description_tags:Split document into sub-sections where <Section> tags occur. '-keep_head' will have no effect when this option is set.
+HTMLPlug.extract_style:Extract style and script information from the HTML <head> tag and save as DocumentHeader metadata. This will be set in the document page as the _document:documentheader_ macro.
 HTMLPlug.file_is_url:Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.
 HTMLPlug.hunt_creator_metadata:Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.
+HTMLPlug.hunt_creator_metadata:Find as much metadata as possible on authorship and place it in the 'Creator' field.
 HTMLPlug.keep_head:Don't remove headers from html files.

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12947

Legend:

trunk/gsdl/perllib/plugins/HTMLPlug.pm

trunk/gsdl/perllib/strings.properties

Download in other formats: