Changeset 8509


Ignore:
Timestamp:
2004-11-11T13:53:21+13:00 (19 years ago)
Author:
chi
Message:

Add new methods (with a smart_block option) to store the blocked associated image files and stylesheet files. This option will allow to read in all the associated images and stylesheet files in the first pass. Also, modification of sub process{} to handle better the file without a section tag.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r8366 r8509  
    4242use util;
    4343use parsargv;
     44use XMLParser;
    4445
    4546sub BEGIN {
     
    135136    $self->{'dir_num'} = 0;
    136137    $self->{'file_num'} = 0;
     138
    137139    return bless $self, $class;
    138140}
     
    143145    my $self = shift (@_);
    144146
    145     return q^(?i)\.(gif|jpe?g|jpe|png|css|js)$^;
     147    return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
    146148}
    147149
     
    153155}
    154156
    155 
     157sub metadata_read {
     158    my $self = shift (@_); 
     159    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
     160
     161    my $outhandle = $self->{'outhandle'};
     162
     163    my $filename = $file;
     164    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
     165
     166    my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
     167
     168    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     169    return undef; # can't recognise
     170    }
     171
     172    # Do encoding stuff
     173    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
     174   
     175    # read in file ($text will be in utf8)
     176    my $text = "";
     177    $self->read_file ($filename, $encoding, $language, \$text);
     178
     179    $self->store_block_files (\$text, $filename);
     180   
     181    return 1;
     182}
     183
     184sub store_block_files
     185{
     186    my $self =shift (@_);
     187    my ($textref, $filename) = @_;
     188
     189    my $html_fname = $filename;
     190    my @file_blocks;
     191
     192    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
     193    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     194    $$textref =~ s/$opencom(.*?)$closecom//gs;
     195
     196    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
     197    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     198    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
     199    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
     200   
     201
     202    foreach my $link (@img_matches, @usemap_matches, @link_matches) {
     203
     204    # remove quotes from link at start and end if necessary
     205    if ($link=~/^\"/) {
     206        $link=~s/^\"//;
     207        $link=~s/\"$//;
     208    }
     209
     210    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
     211
     212    if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {
     213        # Turn relative file path into full path
     214        my $dirname = &File::Basename::dirname($filename);
     215        $link = &util::filename_cat($dirname, $link);
     216    }
     217    $link = $self->eval_dir_dots($link);
     218    $self->{'file_blocks'}->{$link} = 1;
     219    }
     220}
     221 
    156222# do plugin specific processing of doc_obj
    157223sub process {
     
    178244    my $cursection = $doc_obj->get_top_section();
    179245
    180     $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 
     246    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
    181247    unless $self->{'no_metadata'} || $self->{'description_tags'};
    182248
     
    189255
    190256    if ($self->{'description_tags'}) {
    191 
    192257    # remove the html header - note that doing this here means any
    193258    # sections defined within the header will be lost (so all <Section>
    194259    # tags must appear within the body of the HTML)
     260    my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
     261
    195262    $$textref =~ s/^.*?<body[^>]*>//is;
    196263    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     
    198265    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    199266    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     267
    200268    my $lt = '(?:<|&lt;)';
    201269    my $gt = '(?:>|&gt;)';
     
    213281        }
    214282        while ($comment =~ s/$lt(.*?)$gt//s) {
    215        
    216283        my $tag = $1;
    217284        if ($tag eq "Section") {
     
    253320    if ($$textref =~ /\S/) {
    254321        if (!$found_something) {
    255         print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
    256         print $outhandle "          will be processed as a single section document\n";
    257        
     322        if ($self->{'verbosity'} > 2) {
     323            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
     324            print $outhandle "          will be processed as a single section document\n";
     325        }
     326
    258327        # go ahead and process single-section document
    259328        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
     
    262331        # and extract metadata (this won't have been done
    263332        # above as the -description_tags option prevents it)
    264         $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
     333        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
     334        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
    265335            unless $self->{'no_metadata'};
    266336
     
    269339        print $outhandle "          of the final closing </Section> tag. This text will\n";
    270340        print $outhandle "          be ignored.";
     341
    271342        my ($text);
    272343        if (length($$textref) > 30) {
     
    280351    } elsif (!$found_something) {
    281352
    282         # may get to here if document contained no valid Section
    283         # tags but did contain some comments. The text will have
    284         # been processed already but we should print the warning
    285         # as above and extract metadata
    286         print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
    287         print $outhandle "          will be processed as a single section document\n";
    288 
    289         $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
     353        if ($self->{'verbosity'} > 2) {
     354        # may get to here if document contained no valid Section
     355        # tags but did contain some comments. The text will have
     356        # been processed already but we should print the warning
     357        # as above and extract metadata
     358        print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
     359        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
     360        }
     361
     362        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
     363        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
    290364        unless $self->{'no_metadata'};
    291365    }
     
    339413    my ($front, $link, $back, $base_dir,
    340414    $file, $doc_obj, $section) = @_;
     415
    341416    # remove quotes from link at start and end if necessary
    342417    if ($link=~/^\"/) {
     
    357432
    358433    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    359 
     434   
    360435    my $img_file =  $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
    361436    my $anchor_name = $img_file;
     
    387462
    388463    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
    389 
     464   
    390465    ##### leave all these links alone (they won't be picked up by intermediate
    391466    ##### pages). I think that's safest when dealing with frames, targets etc.
     
    404479    &ghtml::urlsafe ($href);
    405480    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
    406    
    407481    } else {
    408482    # link is to some other type of file (eg image) so we'll
     
    434508    return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    435509    }
    436 
    437510    if ($self->{'rename_assoc_files'}) {
    438511    if (defined $self->{'aux_files'}->{$href}) {
     
    446519    $doc_obj->associate_file($filename, $newname, undef, $section);
    447520    return "_httpdocimg_/$newname";
    448 
    449521    } else {
    450522    ($newname) = $filename =~ /([^\/\\]*)$/;
     
    460532
    461533    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
    462 
     534   
    463535    $hash_part = "" if !defined $hash_part;
    464536    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
     
    468540    return ($link, "", 0);
    469541    }
    470    
     542
    471543    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
    472544    my $type = $1;
     
    481553                     
    482554    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
    483 
     555   
    484556    my $rl = 0;
    485557    $rl = 1 if (-e $linkfilename);
     
    519591              $before_hash=$win_before_hash;
    520592            }
    521 
    522593        }
    523594        else {
     
    525596            $before_hash =~ s@^$base_dir/@@;
    526597        }
    527 
    528598        }
    529 
    530599    } else {
    531600        # Turn relative file path into full path
     
    536605
    537606    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
    538 
    539607    # make sure there's a slash on the end if it's a directory
    540608    if ($before_hash !~ /\/$/) {
     
    543611   
    544612    return ("http://" . $before_hash, $hash_part, 1);
    545 
    546613    } else {
    547614    # mailto, news, nntp, telnet, javascript or gopher link
     
    611678    }
    612679
     680
    613681    # find the header in the html file, which has the meta tags
    614682    $$textref =~ m@<head>(.*?)</head>@si;
    615683
    616684    my $html_header=$1;
    617 
    618685    # go through every <meta... tag defined in the html and see if it is
    619686    # one of the tags we want to match.
     
    623690    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    624691    $html_header =~ /^/; # match the start of the string, for \G assertion
    625   
     692 
    626693    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
    627694    my $metatag=$1;
     
    688755        $from = "<title> tags";
    689756    }
     757
    690758    if (!defined $title) {
    691759        $from = "first 100 chars";
     
    748816    my $self = shift (@_);
    749817    my ($filename) = @_;
    750 
    751818    my $dirsep_os = &util::get_os_dirsep();
    752819    my @dirsep = split(/$dirsep_os/,$filename);
     
    756823    if ($d eq "..") {
    757824        pop(@eval_dirs);
    758 
     825       
    759826    } elsif ($d eq ".") {
    760827        # do nothing!
     
    765832    }
    766833
     834    # Need to fiddle with number of elements in @eval_dirs if the
     835    # first one is the empty string.  This is because of a
     836    # modification to util::filename_cat that supresses the addition
     837    # of a leading '/' character (or \ if windows) (intended to help
     838    # filename cat with relative paths) if the first entry in the
     839    # array is the empty string.  Making the array start with *two*
     840    # empty strings is a way to defeat this "smart" option.
     841    #
     842    if (scalar(@eval_dirs) > 0) {
     843    if ($eval_dirs[0] eq ""){
     844        unshift(@eval_dirs,"");
     845    }
     846    }
    767847    return &util::filename_cat(@eval_dirs);
    768848}
Note: See TracChangeset for help on using the changeset viewer.