Changeset 2817


Ignore:
Timestamp:
2001-11-05T16:30:27+13:00 (22 years ago)
Author:
sjboddie
Message:

Implemented a description_tags option to HTMLPlug for splitting an HTML
document into subsections.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r2735 r2817  
    7575    print STDERR "                          Used by, for example, PDFHtml to remove Page 1 etc from text\n";
    7676    print STDERR "                          chosen to be used as the title.\n";
     77    print STDERR "   -description_tags      Split document into sub-sections where <Section> tags occur.\n";
     78    print STDERR "                          Note that by setting this option you implicitly set -no_metadata\n";
     79    print STDERR "                          as all metadata should be included within the <Section> tags.\n";
     80    print STDERR "                          Also, -keep_head will have no effect when this option is set.\n";
    7781}
    7882
     
    9296             q^rename_assoc_files^, \$self->{'rename_assoc_files'},
    9397             q^title_sub/.*/^, \$self->{'title_sub'},
     98             q^description_tags^, \$self->{'description_tags'},
    9499             "allow_extra_options")) {
    95100
     
    139144
    140145    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
    141     unless $self->{'no_metadata'};
     146    unless $self->{'no_metadata'} || $self->{'description_tags'};
    142147
    143148    # Store URL for page as metadata - this can be used for an
     
    149154    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    150155
     156    if ($self->{'description_tags'}) {
     157
     158    my $found_something = 0; my $top = 1;
     159    while ($$textref =~ s/^(.*?)<!--(.*?)-->//s) {
     160        my $text = $1;
     161        my $comment = $2;
     162        if (defined $text) {
     163        $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
     164        }
     165        while ($comment =~ s/<([^>]+)>//s) {
     166        my $tag = $1;
     167        if ($tag eq "Section") {
     168            $found_something = 1;
     169            $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
     170            $top = 0;
     171        } elsif ($tag eq "/Section") {
     172            $found_something = 1;
     173            $cursection = $doc_obj->get_parent_section ($cursection);
     174        } elsif ($tag =~ /^Metadata name=\"([^\"]+)\"/s) {
     175            my $metaname = $1;
     176            $comment =~ s/^(.*?)<\/Metadata>//s;
     177            my $metavalue = $1;
     178            $metavalue =~ s/^\s+//;
     179            $metavalue =~ s/\s+$//;
     180            $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
     181        }
     182        }
     183    }
     184    if ($cursection ne "") {
     185        print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
     186    }
     187
     188    $$textref =~ s/^.*?<body[^>]*>//is;
     189    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     190    if ($$textref =~ /\S/) {
     191        if (!$found_something) {
     192        print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
     193        print $outhandle "          will be processed as a single section document\n";
     194        $self->process_section($$textref, $base_dir, $file, $doc_obj, $cursection);
     195        } else {
     196        print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
     197        print $outhandle "          of the final closing </Section> tag. This text will\n";
     198        print $outhandle "          be ignored.";
     199        if (length($$textref) > 30) {
     200            $text = substr($$textref, 0, 30) . "...";
     201        }
     202        $text =~ s/\n/ /isg;
     203        print $outhandle " ($text)\n";
     204        }
     205    }
     206   
     207    } else {
     208    # single section document
     209    $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
     210    }
     211    return 1;
     212}
     213
     214# note that process_section may be called multiple times for a single
     215# section (relying on the fact that add_utf8_text appends the text to any
     216# that may exist already).
     217sub process_section {
     218    my $self = shift (@_);
     219    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
     220
    151221    # remove header and footer
    152     if (!$self->{'keep_head'}) {
     222    if (!$self->{'keep_head'} || $self->{'description_tags'}) {
    153223    $$textref =~ s/^.*?<body[^>]*>//is;
    154224    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     
    174244    # add text to document object
    175245    $doc_obj->add_utf8_text($cursection, $$textref);
    176 
    177     return 1;
    178 }
    179 
    180 
    181 
     246}
    182247
    183248sub replace_images {
Note: See TracChangeset for help on using the changeset viewer.