Changeset 13968


Ignore:
Timestamp:
2007-03-12T16:17:48+13:00 (17 years ago)
Author:
kjdon
Message:

Added a new option to HTMLPlug (tidy_html) - if set, will use HTMLTidy to tidy up the HTML to XHTML. This is needed if you want to use my book display stuff - Veronica.

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r12970 r13968  
    15911591
    15921592    if (-e $filename) {
    1593     $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
     1593        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
    15941594    $doc_obj->add_utf8_metadata($top_section, "hascover",  1);
    15951595    } else {
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r13241 r13968  
    4242use XMLParser;
    4343
     44use HTML::TokeParser::Simple;
     45use Image::Size;
     46
    4447sub BEGIN {
    4548    @HTMLPlug::ISA = ('BasPlug');
     
    6366      { 'name' => "keep_head",
    6467    'desc' => "{HTMLPlug.keep_head}",
    65     'type' => "flag" },
    66       { 'name' => "extract_style",
    67     'desc' => "{HTMLPlug.extract_style}",
    6868    'type' => "flag" },
    6969      { 'name' => "no_metadata",
     
    107107      { 'name' => "sectionalise_using_h_tags",
    108108    'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
    109     'type' => "flag" }
     109    'type' => "flag" },
     110      { 'name' => "tidy_html",
     111        'desc' => "{HTMLPlug.tidy_html}",
     112    'type' => "flag"},
    110113      ];
    111114
     
    116119        'args'     => $arguments };
    117120
     121# Will make the html input file as a proper XML file with removed font tag and
     122# image size added to the img tag.
     123# The tidying process takes place in a collection specific 'tmp' directory so
     124# that we don't accidentally damage the input.
     125sub tmp_tidy_file
     126{
     127    my $self = shift (@_);
     128    my ($file) = @_;
     129    my $input_filename = $file;
     130 
     131    if (-d $input_filename)
     132    {
     133        return $input_filename;
     134    }
     135   
     136    # get the input filename
     137    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
     138    my $base_dirname = $dirname;
     139    $suffix = lc($suffix);   
     140   
     141    # derive tmp filename from input filename
     142    # Remove any white space from filename -- no risk of name collision, and
     143    # makes later conversion by utils simpler. Leave spaces in path...
     144    # tidy up the filename with space, dot, hyphen between
     145    $tailname =~ s/\s+//g;
     146    $tailname =~ s/\.+//g;
     147    $tailname =~ s/\-+//g;
     148    # convert to utf-8 otherwise we have problems with the doc.xml file
     149    # later on
     150    &unicode::ensure_utf8(\$tailname);
     151   
     152    # softlink to collection tmp dir
     153    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
     154    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
     155   
     156    # remove trailing slashes
     157    $dirname =~ s/[\\\/]+$//;
     158    # create folder for this file
     159    my $folderdirname = &File::Basename::basename($dirname);
     160    $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
     161    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
     162     
     163    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     164   
     165    # tidy the input file if it is a HTML-like file
     166    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
     167    {
     168        # create HTML parser to decode the input file
     169        my $parser = HTML::TokeParser::Simple->new($input_filename);
     170
     171        # write HTML tmp file without the font tag and image size are added to the img tag
     172        open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
     173        while (my $token = $parser->get_token())
     174        {
     175        # is it an img tag
     176        if ($token->is_start_tag('img'))
     177        {
     178                # get the attributes
     179                my $attr = $token->return_attr;
     180
     181                # get the full path to the image
     182                my $img_file = &util::filename_cat($dirname,$attr->{src});
     183       
     184                # set the width and height attribute
     185                ($attr->{width}, $attr->{height}) = imgsize($img_file);
     186
     187                # recreate the tag
     188                print PROD "<img";
     189                print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
     190                print PROD ">";
     191        }
     192        # is it a font tag
     193        else
     194        {
     195            if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
     196            {
     197                # remove font tag
     198                print PROD "";
     199            }
     200            else
     201            {
     202                # print without changes
     203                print PROD $token->as_is;
     204            }
     205        }
     206        }
     207        close (PROD) || die("Error Closing File: $tmp_filename $!");
     208
     209        # run html-tidy on the tmp file to make it a proper XML file
     210        my $tidyfile = `tidy -wrap 0 -asxml $tmp_filename`;
     211
     212        # write result back to the tmp file
     213        open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
     214        print PROD $tidyfile;
     215        close (PROD) || die("Error Closing File: $tmp_filename $!");
     216   
     217    # just for checking copy all other file from the base dir to tmp dir if it is not exists
     218    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
     219    my @files = grep {!/^\.+$/} readdir(DIR);
     220    close(DIR);
     221
     222    foreach my $file (@files)
     223    {
     224        my $src_file = &util::filename_cat($base_dirname,$file);
     225        my $dest_file = &util::filename_cat($tmp_dirname,$file);
     226        if ((!-e $dest_file) && (!-d $src_file))
     227        {
     228            # just copy the original file back to the tmp directory
     229                open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!";
     230                open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!";
     231                print TIDYOUT <TIDYIN>;
     232                close TIDYIN;
     233                close TIDYOUT;
     234        }
     235    }
     236    }
     237    else
     238    {
     239        if (!-e $tmp_filename)
     240    {
     241            # just copy the original file back to the tmp directory
     242            open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!";
     243            open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!";
     244            print TIDYOUT <TIDYIN>;
     245            close TIDYIN;
     246            close TIDYOUT;
     247    }
     248    }
     249   
     250    # return the output filename
     251    return $tmp_filename;
     252}
     253
     254sub read_into_doc_obj
     255{
     256    my $self = shift (@_); 
     257    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     258   
     259    # get the input file
     260    my $input_filename = $file;
     261    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
     262    $suffix = lc($suffix);   
     263   
     264    if ($self->{'tidy_html'})
     265    {
     266        # tidy the input file if it is a HTML-like file
     267        #if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
     268        #{
     269        # set the file to be tidied
     270            $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
     271       
     272            # get the tidied file
     273            my $tidy_filename = $self->tmp_tidy_file($input_filename);
     274       
     275            # derive tmp filename from input filename
     276            my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
     277   
     278        # set the new input file and base_dir to be from the tidied file
     279        $file = "$tailname$suffix";
     280        $base_dir = $dirname;
     281    #}
     282    }
     283   
     284    # call the parent read_into_doc_obj
     285    my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
     286   
     287    return ($process_status,$doc_obj);
     288}
     289
    118290sub new {
    119291    my ($class) = shift (@_);
     
    124296    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    125297   
    126     my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
     298
     299    my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
    127300   
    128301    if ($self->{'w3mir'}) {
     
    148321   
    149322    # the last option is an attempt to encode the concept of an html query ...
    150     return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+[\?\@].+=.*)$^;
     323    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
    151324}
    152325
     
    174347    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    175348    my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    176     my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    177     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
     349
     350    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
    178351
    179352    # remove quotes from link at start and end if necessary
     
    227400    #--></hX>
    228401    if ($self->{'sectionalise_using_h_tags'}) {
    229     # description_tags should always be activated because we convert headings to description tags
     402    # description_tags should allways be activated because we convert headings to description tags
    230403    $self->{'description_tags'} = 1;
    231404
     
    245418
    246419    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
    247     unless $self->{'no_metadata'};
    248 
    249     # extract style info as DocumentHeader metadata
    250     $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file)
    251     if ($self->{'extract_style'} == 1);
     420    unless $self->{'no_metadata'} || $self->{'description_tags'};
    252421
    253422    # Store URL for page as metadata - this can be used for an
     
    355524        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    356525
     526        # if document contains no Section tags we'll go ahead
     527        # and extract metadata (this won't have been done
     528        # above as the -description_tags option prevents it)
     529        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
     530        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
     531            unless $self->{'no_metadata'};
     532
    357533        } else {
    358534        print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
     
    379555        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
    380556        }
    381     }
    382     } # if $self->{'description_tags'}
    383     else {
     557
     558        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
     559        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
     560        unless $self->{'no_metadata'};
     561    }
     562   
     563    } else {
     564
    384565    # remove header and footer
    385     if (!$self->{'keep_head'}) {
     566    if (!$self->{'keep_head'} || $self->{'description_tags'}) {
    386567        $$textref =~ s/^.*?<body[^>]*>//is;
    387568        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     
    509690    my $anchor_name = $img_file;
    510691    $anchor_name =~ s/^.*\///;
    511     $anchor_name = "<a name=\"$anchor_name\"></a>";
     692    $anchor_name = "<a name=\"$anchor_name\" />";
    512693
    513694    return $front . $img_file . $back . $anchor_name;
     
    738919    my $outhandle = $self->{'outhandle'};
    739920    # if we don't want metadata, we may as well not be here ...
    740     return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);
     921    return if (!defined $self->{'metadata_fields'});
    741922
    742923    # metadata fields to extract/save. 'key' is the (lowercase) name of the
     
    758939    }
    759940
    760     if ($self->{'hunt_creator_metadata'} == 1 ) {
     941    if (defined $self->{'hunt_creator_metadata'} &&
     942    $self->{'hunt_creator_metadata'} == 1 ) {
    761943    my @extra_fields =
    762944        (
     
    9131095
    9141096
    915 sub extract_style {
    916     my $self = shift (@_);
    917     my ($textref, $doc_obj, $section, $base_dir, $file) = @_;
    918     my $outhandle = $self->{'outhandle'};
    919    
    920     # find the header in the html file, which has the style info
    921     $$textref =~ m@<head>(.*?)</head>@si;
    922 
    923     my $html_header=$1;
    924     my $style_contents = "";
    925    
    926     # look for style tags
    927     $html_header =~ /^/; # match the start of the string, for \G assertion
    928     while ($html_header =~ m/\G.*?<(style|script|link)/sig) {
    929     my $tag_name = $1;
    930     if ($tag_name eq "style") {
    931         if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) {
    932         $style_contents .= "\n<style";
    933         $style_contents .= $1;
    934         }
    935     }
    936     elsif ($tag_name eq "link") {
    937         $style_contents .= "\n<link";
    938         $html_header =~ m/\G(.*?>)/is;
    939         $style_contents .= $1;
    940     }
    941     elsif ($tag_name eq "script") {
    942         # bit more tricky cos it may or may not have content
    943         if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) {
    944         $style_contents .= "\n<script";
    945         $style_contents .= $1;
    946         } elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) {
    947         $style_contents .= "\n<script";
    948         $style_contents .= $1;
    949         }
    950     }
    951     }
    952 
    953     # now we need to do something with any links found in the style thing
    954     $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    955         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge;
    956 
    957     $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents);
    958    
    959 }
    960 
    9611097# evaluate any "../" to next directory up
    9621098# evaluate any "./" as here
  • trunk/gsdl/perllib/strings.properties

    r13901 r13968  
    817817HTMLPlug.title_sub:Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove "Page 1", etc from text used as the title.
    818818
     819HTMLPlug.tidy_html:If set, converts a HTML document to a well-formed XHTML. It enable users to view the document in the book format.
     820
    819821ImagePlug.converttotype:Convert main image to format 's'.
    820822
Note: See TracChangeset for help on using the changeset viewer.