Ignore:
Timestamp:
2005-09-07T09:18:27+12:00 (18 years ago)
Author:
chi
Message:

Modifications to deal with the document title (as the first H1 heading) and also to store the
metadata title retrieved from the document at the top section.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm

    r10595 r10600  
    9292    print $outhandle "StructuredHTMLPlug: processing $file\n"
    9393        if $self->{'verbosity'} > 1;
     94   
    9495    my @head_and_body = split(/<body/i,$$textref);
    9596    my $head = shift(@head_and_body);
    9697    my $body_text = join("<body", @head_and_body);
    97    
     98    $head =~ m/<title>(.+)<\/title>/i;
     99    my $doctitle = $1 if defined $1;
    98100    if (defined $self->{'extracted_word_metadata_fields'} && $self->{'extracted_word_metadata_fields'}=~ /\S/) {
    99101    my @doc_properties = split(/<xml>/i,$head);
     
    108110    # get rid of TOC and TOF sections and their title
    109111    if ($self->{'delete_toc'} == 1){
    110     #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p>
    111     # get rid of Table of Contents title and Table of Figures
    112     # these two lines don't work - how can we do this properlly??
    113112    if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){
    114113        $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
     
    118117    }
    119118    }
    120 
     119   
    121120    if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){
    122121    $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
    123     $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
     122    #$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
     123    #$doctitle = $3;
     124    $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
     125    #$body_text =~ m/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/isg;
     126    #$doctitle = "<h1>".$3."<\/h1>" if defined $3;
    124127    }
    125128
     
    142145    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    143146   
     147    $section_text .= "<!--\n<Section>\n-->\n";
     148    #my $top_section_tag = "<!--\n<Section>\n-->\n";
     149    #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i;
     150    #$body_text =~ s/(<div.*)/$top_section_tag$1/i;
    144151    my $body = "<body".$body_text;
    145152   
    146153    my $section_text = $head;
    147     $section_text .= "<!--\n<Section>\n-->\n";
    148154   
    149155    # split HTML text on <h1>, <h2> etc tags
     
    203209            }
    204210
    205             my $spacing = "  " x $hnum;         
     211            my $spacing = "  " x $hnum;
    206212            $section_text .= "<!--\n";
    207213            $section_text .= $spacing."<Section>\n";
     
    219225        else {
    220226###     print STDERR "***** hc = <h$hc\n\n";
    221        
    222227        }
    223228        $section_text .= "<h$hc";
     
    275280    $doc_obj->add_utf8_metadata ($cursection, "srcicon",  "_icondoc_");
    276281    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>");
    277 
    278     my $file_size = -s $filename;
     282    $doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle);
     283    my $file_size = -s $filename;
    279284    if ($file_size>1024)
    280285    {
Note: See TracChangeset for help on using the changeset viewer.