Changeset 10600

Show
Ignore:
Timestamp:
07.09.2005 09:18:27 (14 years ago)
Author:
chi
Message:

Modifications to deal with the document title (as the first H1 heading) and also to store the
title metadata retrieved from the document at the top_section.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm

    r10595 r10600  
    9292    print $outhandle "StructuredHTMLPlug: processing $file\n" 
    9393        if $self->{'verbosity'} > 1; 
     94     
    9495    my @head_and_body = split(/<body/i,$$textref); 
    9596    my $head = shift(@head_and_body); 
    9697    my $body_text = join("<body", @head_and_body); 
    97      
     98    $head =~ m/<title>(.+)<\/title>/i; 
     99    my $doctitle = $1 if defined $1; 
    98100    if (defined $self->{'extracted_word_metadata_fields'} && $self->{'extracted_word_metadata_fields'}=~ /\S/) { 
    99101    my @doc_properties = split(/<xml>/i,$head); 
     
    108110    # get rid of TOC and TOF sections and their title 
    109111    if ($self->{'delete_toc'} == 1){ 
    110     #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p> 
    111     # get rid of Table of Contents title and Table of Figures  
    112     # these two lines don't work - how can we do this properlly?? 
    113112    if (defined $self->{'toc_header'}&& $self->{'toc_header'} =~ /\S/){ 
    114113        $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg; 
     
    118117    } 
    119118    } 
    120  
     119     
    121120    if (defined $self->{'title_header'} && $self->{'title_header'}=~ /\S/){ 
    122121    $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is; 
    123     $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg; 
     122    #$body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg; 
     123    #$doctitle = $3; 
     124    $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg; 
     125    #$body_text =~ m/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/isg; 
     126    #$doctitle = "<h1>".$3."<\/h1>" if defined $3; 
    124127    } 
    125128 
     
    142145    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg; 
    143146     
     147    $section_text .= "<!--\n<Section>\n-->\n"; 
     148    #my $top_section_tag = "<!--\n<Section>\n-->\n"; 
     149    #$body_text =~ s/(<div.*)/$top_section_text$doctitle$1/i; 
     150    #$body_text =~ s/(<div.*)/$top_section_tag$1/i; 
    144151    my $body = "<body".$body_text; 
    145152     
    146153    my $section_text = $head; 
    147     $section_text .= "<!--\n<Section>\n-->\n"; 
    148154     
    149155    # split HTML text on <h1>, <h2> etc tags 
     
    203209            } 
    204210 
    205             my $spacing = "  " x $hnum;          
     211            my $spacing = "  " x $hnum; 
    206212            $section_text .= "<!--\n"; 
    207213            $section_text .= $spacing."<Section>\n"; 
     
    219225        else { 
    220226###     print STDERR "***** hc = <h$hc\n\n"; 
    221          
    222227        } 
    223228        $section_text .= "<h$hc"; 
     
    275280    $doc_obj->add_utf8_metadata ($cursection, "srcicon",  "_icondoc_"); 
    276281    $doc_obj->add_utf8_metadata ($cursection, "/srclink", "</a>"); 
    277  
    278     my $file_size = -s $filename; 
     282    $doc_obj->add_utf8_metadata ($cursection, "Title", $doctitle); 
     283    my $file_size = -s $filename; 
    279284    if ($file_size>1024) 
    280285    {