Context Navigation

← Previous Changeset
Next Changeset →

Changeset 13968

Timestamp:

2007-03-12T16:17:48+13:00 (17 years ago)

Author:

kjdon

Message:

Added a new option to HTMLPlug (tidy_html) - if set, will use HTMLTidy to tidy up the HTML to XHTML. This is needed if you want to use my book display stuff - Veronica.

Location:

trunk/gsdl/perllib

Files:

: 3 edited

plugins/BasPlug.pm (modified) (1 diff)
plugins/HTMLPlug.pm (modified) (15 diffs)
strings.properties (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BasPlug.pm

r12970	r13968
1591	1591
1592	1592	if (-e $filename) {
1593		$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
	1593	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1594	1594	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1595	1595	} else {

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r13241
+              r13968
 use XMLParser;
+use HTML::TokeParser::Simple;
+use Image::Size;
 sub BEGIN {
     @HTMLPlug::ISA = ('BasPlug');
 …
       { 'name' => "keep_head",
     'desc' => "{HTMLPlug.keep_head}",
-    'type' => "flag" },
-      { 'name' => "extract_style",
-    'desc' => "{HTMLPlug.extract_style}",
     'type' => "flag" },
       { 'name' => "no_metadata",
 …
       { 'name' => "sectionalise_using_h_tags",
     'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
+    'type' => "flag" }
+    'type' => "flag" },
+      { 'name' => "tidy_html",
+        'desc' => "{HTMLPlug.tidy_html}",
+    'type' => "flag"},
       ];
 …
         'args'     => $arguments };
+# Produce a tidied copy of $file and return the path of that copy.
+#
+# The copy is written to a "tidytmp" directory under $ENV{'GSDLCOLLECTDIR'},
+# inside a sub-folder named after the input file's parent directory, so the
+# original input is never modified.
+#
+# For HTML-like files (.htm, .html, .shtml):
+#   - <font> start and end tags are dropped,
+#   - each <img> tag gets width/height attributes when Image::Size can
+#     measure the referenced file,
+#   - the result is run through the external 'tidy' program (-asxml),
+#   - every other plain file in the input directory that is not already in
+#     the tmp folder is copied there as well.
+# Any other file is simply copied to the tmp folder if it is not there yet.
+#
+# Parameters:
+#   $file - path of the file to tidy; a directory is returned unchanged.
+# Returns:
+#   the path of the tidied (or copied) file.
+# Dies if a file or directory that must be read or written cannot be opened.
+sub tmp_tidy_file
+{
+    my $self = shift (@_);
+    my ($file) = @_;
+    my $input_filename = $file;
+    # directories are handed back untouched
+    if (-d $input_filename)
+    {
+        return $input_filename;
+    }
+    # split the input filename into name, directory and (lower-cased) suffix
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    my $base_dirname = $dirname;
+    $suffix = lc($suffix);
+    # derive tmp filename from input filename
+    # Remove any white space, dots and hyphens from the filename.
+    # Spaces are left in the path...
+    $tailname =~ s/\s+//g;
+    $tailname =~ s/\.+//g;
+    $tailname =~ s/\-+//g;
+    # convert to utf-8 otherwise we have problems with the doc.xml file
+    # later on
+    &unicode::ensure_utf8(\$tailname);
+    # top-level tmp directory used for all tidied files
+    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
+    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
+    # remove trailing slashes
+    $dirname =~ s/[\\\/]+$//;
+    # create a folder named after the input file's parent directory
+    my $folderdirname = &File::Basename::basename($dirname);
+    $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
+    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
+    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+    # tidy the input file if it is a HTML-like file
+    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
+    {
+        # create HTML parser to decode the input file
+        my $parser = HTML::TokeParser::Simple->new($input_filename)
+            or die("Error Reading File: $input_filename $!");
+        # write the HTML tmp file without font tags and with image sizes
+        # added to the img tags
+        open (my $out_fh, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+        while (my $token = $parser->get_token())
+        {
+            if ($token->is_start_tag('img'))
+            {
+                # get the attributes
+                my $attr = $token->return_attr;
+                if (defined $attr->{src})
+                {
+                    # get the full path to the image
+                    my $img_file = &util::filename_cat($dirname,$attr->{src});
+                    # imgsize yields undef dimensions when it cannot read
+                    # the image; in that case keep whatever width/height
+                    # the author supplied instead of blanking them
+                    my ($width, $height) = imgsize($img_file);
+                    if (defined $width && defined $height)
+                    {
+                        $attr->{width} = $width;
+                        $attr->{height} = $height;
+                    }
+                }
+                # recreate the tag; the parser reports the slash of
+                # "<img ... />" as an attribute named "/", which must not
+                # be written out as /="/"
+                print $out_fh "<img";
+                foreach my $name (grep { $_ ne '/' } keys %$attr)
+                {
+                    print $out_fh ' ' . $name . '="' . $attr->{$name} . '"';
+                }
+                print $out_fh ">";
+            }
+            elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
+            {
+                # font tags are dropped: nothing is written
+            }
+            else
+            {
+                # print without changes
+                print $out_fh $token->as_is;
+            }
+        }
+        close ($out_fh) or die("Error Closing File: $tmp_filename $!");
+        # run html-tidy on the tmp file to make it a proper XML file; the
+        # filename is quoted because the path may contain spaces
+        my $tidyfile = `tidy -wrap 0 -asxml "$tmp_filename"`;
+        if (defined $tidyfile && length($tidyfile))
+        {
+            # write result back to the tmp file
+            open (my $tidy_fh, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+            print $tidy_fh $tidyfile;
+            close ($tidy_fh) or die("Error Closing File: $tmp_filename $!");
+        }
+        else
+        {
+            # tidy is missing or failed: keep the untidied copy rather than
+            # truncating the tmp file to nothing
+            warn "tmp_tidy_file: tidy produced no output for $tmp_filename\n";
+        }
+        # copy every other plain file from the base dir to the tmp dir if
+        # it does not exist there yet
+        opendir (my $dir_handle, $base_dirname) or die "Can't open base directory : $base_dirname!";
+        my @files = grep {!/^\.+$/} readdir($dir_handle);
+        closedir ($dir_handle);
+        foreach my $sibling (@files)
+        {
+            my $src_file = &util::filename_cat($base_dirname,$sibling);
+            my $dest_file = &util::filename_cat($tmp_dirname,$sibling);
+            if ((!-e $dest_file) && (!-d $src_file))
+            {
+                _tidy_copy_file($src_file, $dest_file);
+            }
+        }
+    }
+    else
+    {
+        if (!-e $tmp_filename)
+        {
+            # not HTML: just copy the original file to the tmp directory
+            _tidy_copy_file($input_filename, $tmp_filename);
+        }
+    }
+    # return the output filename
+    return $tmp_filename;
+}
+# Copy $src_file to $dest_file byte for byte.
+# Dies if either file cannot be opened or the destination cannot be closed.
+sub _tidy_copy_file
+{
+    my ($src_file, $dest_file) = @_;
+    open (my $in_fh, '<', $src_file) or die "Can't open $src_file : $!";
+    open (my $out_fh, '>', $dest_file) or die "Can't open $dest_file : $!";
+    # binary mode so that images and other non-text files are not damaged
+    # by newline translation on platforms that perform it
+    binmode ($in_fh);
+    binmode ($out_fh);
+    # read fixed-size chunks instead of loading the file line by line
+    local $/ = \65536;
+    while (my $chunk = <$in_fh>)
+    {
+        print $out_fh $chunk;
+    }
+    close ($in_fh);
+    close ($out_fh) or die "Can't close $dest_file : $!";
+}
+# Read $file into a document object, tidying it first when requested.
+#
+# When the 'tidy_html' option is set, the input file is passed through
+# tmp_tidy_file and $file / $base_dir are redirected to the copy it returns,
+# so that the parent implementation processes that copy instead of the
+# original. Otherwise the arguments are passed on unchanged.
+#
+# Parameters are exactly those of BasPlug::read_into_doc_obj, to which they
+# are forwarded.
+# Returns:
+#   ($process_status, $doc_obj) as returned by BasPlug::read_into_doc_obj.
+sub read_into_doc_obj
+{
+    my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
+    if ($self->{'tidy_html'})
+    {
+        # full path of the file to be tidied
+        my $input_filename = $file;
+        $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
+        # get the tidied file
+        my $tidy_filename = $self->tmp_tidy_file($input_filename);
+        # split the tidied filename so the parent can be pointed at it
+        my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
+        # set the new input file and base_dir to be from the tidied file
+        $file = "$tailname$suffix";
+        $base_dir = $dirname;
+    }
+    # call the parent read_into_doc_obj
+    my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli);
+    return ($process_status,$doc_obj);
+}
 sub new {
     my ($class) = shift (@_);
 …
     if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
+    my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
+    my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
     if ($self->{'w3mir'}) {
 …
     # the last option is an attempt to encode the concept of an html query ...
     return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+[\?\@].+=.*)$^;
+    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
+}
 …
     my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
+    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
+    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) {
     # remove quotes from link at start and end if necessary
 …
     #--></hX>
     if ($self->{'sectionalise_using_h_tags'}) {
     # description_tags should always be activated because we convert headings to description tags
+    # description_tags should allways be activated because we convert headings to description tags
     $self->{'description_tags'} = 1;
 …
     $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
+    unless $self->{'no_metadata'};
+    # extract style info as DocumentHeader metadata
+    $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file)
+    if ($self->{'extract_style'} == 1);
+    unless $self->{'no_metadata'} || $self->{'description_tags'};
     # Store URL for page as metadata - this can be used for an
 …
         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+        # if document contains no Section tags we'll go ahead
+        # and extract metadata (this won't have been done
+        # above as the -description_tags option prevents it)
+        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
+        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
+            unless $self->{'no_metadata'};
         } else {
         print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
 …
         print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
+        }
+    }
+    } # if $self->{'description_tags'}
+    else {
+        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
+        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
+        unless $self->{'no_metadata'};
+    }
+    } else {
     # remove header and footer
     if (!$self->{'keep_head'}) {
+    if (!$self->{'keep_head'} || $self->{'description_tags'}) {
         $$textref =~ s/^.*?<body[^>]*>//is;
         $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
 …
     my $anchor_name = $img_file;
     $anchor_name =~ s/^.*\///;
     $anchor_name = "<a name=\"$anchor_name\"></a>";
+    $anchor_name = "<a name=\"$anchor_name\" />";
     return $front . $img_file . $back . $anchor_name;
 …
     my $outhandle = $self->{'outhandle'};
     # if we don't want metadata, we may as well not be here ...
     return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);
+    return if (!defined $self->{'metadata_fields'});
     # metadata fields to extract/save. 'key' is the (lowercase) name of the
 …
+    }
+    if ($self->{'hunt_creator_metadata'} == 1 ) {
+    if (defined $self->{'hunt_creator_metadata'} &&
+    $self->{'hunt_creator_metadata'} == 1 ) {
     my @extra_fields =
+        (
 …
-sub extract_style {
-    my $self = shift (@_);
-    my ($textref, $doc_obj, $section, $base_dir, $file) = @_;
-    my $outhandle = $self->{'outhandle'};
-    # find the header in the html file, which has the style info
-    $$textref =~ m@<head>(.*?)</head>@si;
-    my $html_header=$1;
-    my $style_contents = "";
-    # look for style tags
-    $html_header =~ /^/; # match the start of the string, for \G assertion
-    while ($html_header =~ m/\G.*?<(style|script|link)/sig) {
-    my $tag_name = $1;
-    if ($tag_name eq "style") {
-        if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) {
-        $style_contents .= "\n<style";
-        $style_contents .= $1;
+        }
+    }
-    elsif ($tag_name eq "link") {
-        $style_contents .= "\n<link";
-        $html_header =~ m/\G(.*?>)/is;
-        $style_contents .= $1;
+    }
-    elsif ($tag_name eq "script") {
-        # bit more tricky cos it may or may not have content
-        if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) {
-        $style_contents .= "\n<script";
-        $style_contents .= $1;
-        } elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) {
-        $style_contents .= "\n<script";
-        $style_contents .= $1;
+        }
+    }
+    }
-    # now we need to do something with any links found in the style thing
-    $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
-        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge;
-    $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents);
+}
 # evaluate any "../" to next directory up
 # evaluate any "./" as here

trunk/gsdl/perllib/strings.properties

r13901	r13968
817	817	HTMLPlug.title_sub:Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove "Page 1", etc from text used as the title.
818	818
	819	HTMLPlug.tidy_html:If set, converts an HTML document to well-formed XHTML. This enables users to view the document in the book format.
	820
819	821	ImagePlug.converttotype:Convert main image to format 's'.
820	822

Note: See TracChangeset for help on using the changeset viewer.