Context Navigation

← Previous Changeset
Next Changeset →

Changeset 8509

Timestamp:

2004-11-11T13:53:21+13:00 (19 years ago)

Author:

chi

Message:

Add new methods (enabled by a smart_block option) to record the associated image files and stylesheet files that are to be blocked. This option allows all the associated image and stylesheet files to be read in during the first pass. Also, modify sub process{} to better handle a file without a Section tag.

File:

: 1 edited

trunk/gsdl/perllib/plugins/HTMLPlug.pm (modified) (31 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r8366
+              r8509
 use util;
 use parsargv;
+use XMLParser;
 sub BEGIN {
 …
     $self->{'dir_num'} = 0;
     $self->{'file_num'} = 0;
     return bless $self, $class;
+}
 …
     my $self = shift (@_);
     return q^(?i)\.(gif|jpe?g|jpe|png|css|js)$^;
+    return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
+}
 …
+}
+sub metadata_read {  # reads one HTML file and hands its text to store_block_files(); returns undef if the file is not handled, 1 otherwise
+    my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;  # only $base_dir and $file are used in this sub
+    my $outhandle = $self->{'outhandle'};  # not used further in this sub
+    my $filename = $file;
+    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;  # prepend $base_dir only when it contains a word character
+    my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;  # part of $filename up to and including its last / or \ ; not used further in this sub
+    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {  # skip names that do not match process_exp and anything that is not a plain file
+    return undef; # can't recognise
+    }
+    # Do encoding stuff
+    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
+    # read in file ($text will be in utf8)
+    my $text = "";
+    $self->read_file ($filename, $encoding, $language, \$text);
+    $self->store_block_files (\$text, $filename);  # records the img src / usemap / link href targets found in $text in $self->{'file_blocks'}
+    return 1;
+}
+sub store_block_files  # takes ($textref, $filename): collects img src, img usemap and link href values from the HTML in $$textref and stores each resolved path as a key of $self->{'file_blocks'}
+{
+    my $self =shift (@_);
+    my ($textref, $filename) = @_;
+    my $html_fname = $filename;  # not used further in this sub
+    my @file_blocks;  # declared but never filled or read in this sub
+    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
+    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
+    $$textref =~ s/$opencom(.*?)$closecom//gs;  # strips comments (literal or entity-escaped delimiters) from the caller's text in place
+    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";  # an attribute value: a non-empty double-quoted string, or a run of characters other than whitespace and >
+    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
+    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
+    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
+    foreach my $link (@img_matches, @usemap_matches, @link_matches) {
+    # remove quotes from link at start and end if necessary
+    if ($link=~/^\"/) {
+        $link=~s/^\"//;
+        $link=~s/\"$//;
+    }
+    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html (a value that is only "#name" becomes the empty string)
+    if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {  # NOTE: the first test also strips a leading / as a side effect; the branch runs only for links with no leading / and no upper-case drive-letter prefix
+        # Turn relative file path into full path
+        my $dirname = &File::Basename::dirname($filename);  # directory of the HTML file being scanned
+        $link = &util::filename_cat($dirname, $link);
+    }
+    $link = $self->eval_dir_dots($link);  # presumably collapses "." and ".." path components; eval_dir_dots is only partly visible here
+    $self->{'file_blocks'}->{$link} = 1;  # the hash is used as a set keyed by path
+    }
+}
 # do plugin specific processing of doc_obj
 sub process {
 …
     my $cursection = $doc_obj->get_top_section();
     $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
+    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
     unless $self->{'no_metadata'} || $self->{'description_tags'};
 …
     if ($self->{'description_tags'}) {
     # remove the html header - note that doing this here means any
     # sections defined within the header will be lost (so all <Section>
     # tags must appear within the body of the HTML)
+    my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
     $$textref =~ s/^.*?<body[^>]*>//is;
     $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
 …
     my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
     my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     my $lt = '(?:<|&lt;)';
     my $gt = '(?:>|&gt;)';
 …
+        }
         while ($comment =~ s/$lt(.*?)$gt//s) {
         my $tag = $1;
         if ($tag eq "Section") {
 …
     if ($$textref =~ /\S/) {
         if (!$found_something) {
+        print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
+        print $outhandle "          will be processed as a single section document\n";
+        if ($self->{'verbosity'} > 2) {
+            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
+            print $outhandle "          will be processed as a single section document\n";
+        }
         # go ahead and process single-section document
         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
 …
         # and extract metadata (this won't have been done
         # above as the -description_tags option prevents it)
+        $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
+        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
+        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
             unless $self->{'no_metadata'};
 …
         print $outhandle "          of the final closing </Section> tag. This text will\n";
         print $outhandle "          be ignored.";
         my ($text);
         if (length($$textref) > 30) {
 …
     } elsif (!$found_something) {
+        # may get to here if document contained no valid Section
+        # tags but did contain some comments. The text will have
+        # been processed already but we should print the warning
+        # as above and extract metadata
+        print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
+        print $outhandle "          will be processed as a single section document\n";
+        $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
+        if ($self->{'verbosity'} > 2) {
+        # may get to here if document contained no valid Section
+        # tags but did contain some comments. The text will have
+        # been processed already but we should print the warning
+        # as above and extract metadata
+        print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
+        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
+        }
+        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
+        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
         unless $self->{'no_metadata'};
+    }
 …
     my ($front, $link, $back, $base_dir,
     $file, $doc_obj, $section) = @_;
     # remove quotes from link at start and end if necessary
     if ($link=~/^\"/) {
 …
     my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
     my $img_file =  $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
     my $anchor_name = $img_file;
 …
     my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
     ##### leave all these links alone (they won't be picked up by intermediate
     ##### pages). I think that's safest when dealing with frames, targets etc.
 …
     &ghtml::urlsafe ($href);
     return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
     } else {
     # link is to some other type of file (eg image) so we'll
 …
     return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
+    }
     if ($self->{'rename_assoc_files'}) {
     if (defined $self->{'aux_files'}->{$href}) {
 …
     $doc_obj->associate_file($filename, $newname, undef, $section);
     return "_httpdocimg_/$newname";
     } else {
     ($newname) = $filename =~ /([^\/\\]*)$/;
 …
     my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
     $hash_part = "" if !defined $hash_part;
     if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
 …
     return ($link, "", 0);
+    }
     if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
     my $type = $1;
 …
     my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
     my $rl = 0;
     $rl = 1 if (-e $linkfilename);
 …
               $before_hash=$win_before_hash;
+            }
+        }
         else {
 …
             $before_hash =~ s@^$base_dir/@@;
+        }
+        }
     } else {
         # Turn relative file path into full path
 …
     my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
     # make sure there's a slash on the end if it's a directory
     if ($before_hash !~ /\/$/) {
 …
     return ("http://" . $before_hash, $hash_part, 1);
     } else {
     # mailto, news, nntp, telnet, javascript or gopher link
 …
+    }
     # find the header in the html file, which has the meta tags
     $$textref =~ m@<head>(.*?)</head>@si;
     my $html_header=$1;
     # go through every <meta... tag defined in the html and see if it is
     # one of the tags we want to match.
 …
     # this assumes that ">" won't appear. (I don't think it's allowed to...)
     $html_header =~ /^/; # match the start of the string, for \G assertion
     while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
     my $metatag=$1;
 …
         $from = "<title> tags";
+    }
     if (!defined $title) {
         $from = "first 100 chars";
 …
     my $self = shift (@_);
     my ($filename) = @_;
     my $dirsep_os = &util::get_os_dirsep();
     my @dirsep = split(/$dirsep_os/,$filename);
 …
     if ($d eq "..") {
         pop(@eval_dirs);
     } elsif ($d eq ".") {
         # do nothing!
 …
+    }
+    # Need to fiddle with number of elements in @eval_dirs if the
+    # first one is the empty string.  This is because of a
+    # modification to util::filename_cat that suppresses the addition
+    # of a leading '/' character (or \ if windows) (intended to help
+    # filename cat with relative paths) if the first entry in the
+    # array is the empty string.  Making the array start with *two*
+    # empty strings is a way to defeat this "smart" option.
+    #
+    if (scalar(@eval_dirs) > 0) {
+    if ($eval_dirs[0] eq ""){
+        unshift(@eval_dirs,"");
+    }
+    }
     return &util::filename_cat(@eval_dirs);
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 8509

Legend:

trunk/gsdl/perllib/plugins/HTMLPlug.pm

Download in other formats: