Changeset 12947
- Timestamp:
- 2006-09-29T15:38:44+12:00 (18 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r12883 r12947 63 63 { 'name' => "keep_head", 64 64 'desc' => "{HTMLPlug.keep_head}", 65 'type' => "flag" }, 66 { 'name' => "extract_style", 67 'desc' => "{HTMLPlug.extract_style}", 65 68 'type' => "flag" }, 66 69 { 'name' => "no_metadata", … … 171 174 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs); 172 175 my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs); 173 174 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches ) {176 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 177 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 175 178 176 179 # remove quotes from link at start and end if necessary … … 242 245 243 246 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 244 unless $self->{'no_metadata'} || $self->{'description_tags'}; 247 unless $self->{'no_metadata'}; 248 249 # extract style info as DocumentHeader metadata 250 $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file) 251 if ($self->{'extract_style'} == 1); 245 252 246 253 # Store URL for page as metadata - this can be used for an … … 348 355 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 349 356 350 # if document contains no Section tags we'll go ahead351 # and extract metadata (this won't have been done352 # above as the -description_tags option prevents it)353 my $complete_text = $head_keep.$doc_obj->get_text($cursection);354 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)355 unless $self->{'no_metadata'};356 357 357 } else { 358 358 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n"; … … 379 379 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 380 380 } 381 382 my $complete_text = $head_keep.$doc_obj->get_text($cursection); 383 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection) 384 unless $self->{'no_metadata'}; 385 } 386 } 381 } 382 } # if $self->{'description_tags'} 387 383 else { 388 384 # remove header and footer … … 742 738 my $outhandle = $self->{'outhandle'}; 743 739 # if we don't want metadata, we may as well not be here ... 744 return if (!defined $self->{'metadata_fields'} );740 return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0); 745 741 746 742 # metadata fields to extract/save. 'key' is the (lowercase) name of the … … 762 758 } 763 759 764 if (defined $self->{'hunt_creator_metadata'} && 765 $self->{'hunt_creator_metadata'} == 1 ) { 760 if ($self->{'hunt_creator_metadata'} == 1 ) { 766 761 my @extra_fields = 767 762 ( … … 918 913 919 914 915 sub extract_style { 916 my $self = shift (@_); 917 my ($textref, $doc_obj, $section, $base_dir, $file) = @_; 918 my $outhandle = $self->{'outhandle'}; 919 920 # find the header in the html file, which has the style info 921 $$textref =~ m@<head>(.*?)</head>@si; 922 923 my $html_header=$1; 924 my $style_contents = ""; 925 926 # look for style tags 927 $html_header =~ /^/; # match the start of the string, for \G assertion 928 while ($html_header =~ m/\G.*?<(style|script|link)/sig) { 929 my $tag_name = $1; 930 if ($tag_name eq "style") { 931 if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) { 932 $style_contents .= "\n<style"; 933 $style_contents .= $1; 934 } 935 } 936 elsif ($tag_name eq "link") { 937 $style_contents .= "\n<link"; 938 $html_header =~ m/\G(.*?>)/is; 939 $style_contents .= $1; 940 } 941 elsif ($tag_name eq "script") { 942 # bit more tricky cos it may or may not have content 943 if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) { 944 $style_contents .= "\n<script"; 945 $style_contents .= $1; 946 } elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) { 947 $style_contents .= "\n<script"; 948 $style_contents .= $1; 949 } 950 } 951 } 952 953 # now we need to do something with any links found in the style thing 954 $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 955 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge; 956 957 $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents); 958 959 } 960 920 961 # evaluate any "../" to next directory up 921 962 # evaluate any "./" as here -
trunk/gsdl/perllib/strings.properties
r12817 r12947 778 778 HTMLPlug.desc:This plugin processes HTML files 779 779 780 HTMLPlug.description_tags:Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set. 780 HTMLPlug.description_tags:Split document into sub-sections where <Section> tags occur. '-keep_head' will have no effect when this option is set. 781 782 HTMLPlug.extract_style:Extract style and script information from the HTML <head> tag and save as DocumentHeader metadata. This will be set in the document page as the _document:documentheader_ macro. 781 783 782 784 HTMLPlug.file_is_url:Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure. 783 785 784 HTMLPlug.hunt_creator_metadata:Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.786 HTMLPlug.hunt_creator_metadata:Find as much metadata as possible on authorship and place it in the 'Creator' field. 785 787 786 788 HTMLPlug.keep_head:Don't remove headers from html files.
Note:
See TracChangeset
for help on using the changeset viewer.