Changeset 14012


Ignore:
Timestamp:
2007-04-16T15:43:20+12:00 (17 years ago)
Author:
cvs_anon
Message:

modify HTMLPlug to convert old HDL section style to new HDL section style

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r13968 r14012  
    111111        'desc' => "{HTMLPlug.tidy_html}",
    112112    'type' => "flag"},
     113      { 'name' => "old_style_HDL",
     114        'desc' => "{HTMLPlug.old_style_HDL}",
     115    'type' => "flag"}   
    113116      ];
    114117
     
    119122        'args'     => $arguments };
    120123
    121 # Will make the html input file as a proper XML file with removed font tag and
    122 # image size added to the img tag.
    123 # The tidying process takes place in a collection specific 'tmp' directory so
    124 # that we don't accidentally damage the input.
    125 sub tmp_tidy_file
     124
     125sub HB_read_html_file {
     126    my $self = shift (@_);
     127    my ($htmlfile, $text) = @_;
     128       
     129    # load in the file
     130    if (!open (FILE, $htmlfile)) {
     131    print STDERR "ERROR - could not open $htmlfile\n";
     132    return;
     133    }
     134
     135    my $foundbody = 0;
     136    $self->HB_gettext (\$foundbody, $text, "FILE");
     137    close FILE;
     138   
     139    # just in case there was no <body> tag
     140    if (!$foundbody) {
     141    $foundbody = 1;
     142    open (FILE, $htmlfile) || return;
     143    $self->HB_gettext (\$foundbody, $text, "FILE");
     144    close FILE;
     145    }
     146    # text is in utf8
     147}       
     148
     149# converts the text to utf8, as ghtml does that for &eacute; etc.
     150sub HB_gettext {
     151    my $self = shift (@_);
     152    my ($foundbody, $text, $handle) = @_;
     153   
     154    my $line = "";
     155    while (defined ($line = <$handle>)) {
     156    # look for body tag
     157    if (!$$foundbody) {
     158        if ($line =~ s/^.*<body[^>]*>//i) {
     159        $$foundbody = 1;
     160        } else {
     161        next;
     162        }
     163    }
     164   
     165    # check for symbol fonts
     166    if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
     167        my $font = $1;
     168        print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
     169        if ($font !~ /^arial$/i);
     170    }
     171
     172    $$text .= $line;
     173    }
     174
     175    if ($self->{'input_encoding'} eq "iso_8859_1") {
     176    # convert to utf-8
     177    $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
     178    }
     179    # convert any alphanumeric character entities to their utf-8
     180    # equivalent for indexing purposes
     181    &ghtml::convertcharentities ($$text);
     182
     183    $$text =~ s/\s+/ /g; # remove \n's
     184}
     185
     186sub HB_clean_section {
     187    my $self = shift (@_);
     188    my ($section) = @_;
     189
     190    # remove tags without a starting tag from the section
     191    my ($tag, $tagstart);
     192    while ($section =~ /<\/([^>]{1,10})>/) {
     193    $tag = $1;
     194    $tagstart = index($section, "<$tag");
     195    last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
     196    $section =~ s/<\/$tag>//;
     197    }
     198   
     199    # remove extra paragraph tags
     200    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
     201   
     202    # remove extra stuff at the end of the section
     203    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
     204   
     205    # add a newline at the beginning of each paragraph
     206    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
     207   
     208    # add a newline every 80 characters at a word boundary
     209    # Note: this regular expression puts a line feed before
     210    # the last word in each section, even when it is not
     211    # needed.
     212    $section =~ s/(.{1,80})\s/$1\n/g;
     213   
     214    # fix up the image links
     215    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
     216    <center><img src=\"$1\"><\/center><br>/ig;
     217    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
     218    <center><img src=\"$1\"><\/center><br>/ig;
     219
     220    return $section;
     221}
     222       
     223# Will convert the oldHDL format to the new HDL format (using the Section tag) 
     224sub convert_to_newHDLformat
     225{
     226   my $self = shift (@_);
     227   my ($file,$cnfile) = @_;
     228   my $input_filename = $file;
     229   my $tmp_filename = $cnfile;
     230   
     231   # write HTML tmp file with new HDL format
     232   open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
     233   
     234   # read in the file and do basic html cleaning (removing header etc)
     235   my $html = "";
     236   $self->HB_read_html_file ($input_filename, \$html);
     237   
     238    # process the file one section at a time
     239    my $curtoclevel = 1;
     240    my $firstsection = 1;
     241    my $toclevel = 0;
     242    while (length ($html) > 0) {
     243    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
     244        $toclevel = $3;
     245        my $title = $4;
     246        my $sectiontext = "";
     247        if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
     248        $sectiontext = $1;
     249        } else {
     250        $sectiontext = $html;
     251        $html = "";
     252        }
     253
     254        # remove tags and extra spaces from the title
     255        $title =~ s/<\/?[^>]+>//g;
     256        $title =~ s/^\s+|\s+$//g;
     257
     258        # close any sections below the current level and
     259        # create a new section (special case for the firstsection)
     260        print PROD "<!--\n";
     261        while (($curtoclevel > $toclevel) ||
     262           (!$firstsection && $curtoclevel == $toclevel)) {
     263        $curtoclevel--;
     264        print PROD "</Section>\n";
     265        }
     266        if ($curtoclevel+1 < $toclevel) {
     267        print STDERR "WARNING - jump in toc levels in $input_filename " .
     268            "from $curtoclevel to $toclevel\n";
     269        }
     270        while ($curtoclevel < $toclevel) {
     271        $curtoclevel++;
     272        }
     273
     274        if ($curtoclevel == 1) {
     275            # add the header tag
     276        print PROD "-->\n";
     277            print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
     278        print PROD "<!--\n";
     279        }
     280       
     281        print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
     282       
     283        print PROD "-->\n";
     284         
     285        # clean up the section html
     286        $sectiontext = $self->HB_clean_section($sectiontext);
     287
     288        print PROD "$sectiontext\n";     
     289
     290    } else {
     291        print STDERR "WARNING - leftover text\n" , $self->shorten($html),
     292        "\nin $input_filename\n";
     293        last;
     294    }
     295    $firstsection = 0;
     296    }
     297   
     298    print PROD "<!--\n";
     299    while (($curtoclevel > $toclevel) ||
     300       (!$firstsection && $curtoclevel == $toclevel)) {
     301    $curtoclevel--;
     302    print PROD "</Section>\n";
     303    }
     304    print PROD "</Section>\n";
     305    print PROD "-->\n";
     306   
     307    close (PROD) || die("Error Closing File: $tmp_filename $!");
     308   
     309    return $tmp_filename;
     310}       
     311
     312sub convert_tidy_or_oldHDL_file
    126313{
    127314    my $self = shift (@_);
     
    160347    $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
    161348    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
    162      
     349   
    163350    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
    164    
    165     # tidy the input file if it is a HTML-like file
     351    
     352    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    166353    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
     354    {   
     355        # convert the input file to a new style HDL
     356        my $hdl_output_filename = $input_filename;
     357        if ($self->{'old_style_HDL'})
     358        {
     359        $hdl_output_filename = &util::filename_cat($tmp_dirname, "newHDL_$tailname$suffix");
     360        $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
     361        }
     362   
     363    # tidy the input file
     364    my $tidy_output_filename = $hdl_output_filename;
     365    if ($self->{'tidy_html'})
     366    {
     367        $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     368        $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
     369    }
     370   
     371    $tmp_filename = $tidy_output_filename;
     372   
     373    # just for checking copy all other file from the base dir to tmp dir if it is not exists
     374    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
     375    my @files = grep {!/^\.+$/} readdir(DIR);
     376    close(DIR);
     377
     378    foreach my $file (@files)
     379    {
     380        my $src_file = &util::filename_cat($base_dirname,$file);
     381        my $dest_file = &util::filename_cat($tmp_dirname,$file);
     382        if ((!-e $dest_file) && (!-d $src_file))
     383        {
     384            # just copy the original file back to the tmp directory
     385                open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!";
     386                open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!";
     387                print TIDYOUT <TIDYIN>;
     388                close TIDYIN;
     389                close TIDYOUT;
     390        }
     391    }
     392    }
     393    else
    167394    {
     395        if (!-e $tmp_filename)
     396    {
     397            # just copy the original file back to the tmp directory
     398            open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!";
     399            open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!";
     400            print TIDYOUT <TIDYIN>;
     401            close TIDYIN;
     402            close TIDYOUT;
     403    }
     404    }
     405   
     406    return $tmp_filename;
     407}
     408
     409   
     410# Will make the html input file as a proper XML file with removed font tag and
     411# image size added to the img tag.
     412# The tidying process takes place in a collection specific 'tmp' directory so
     413# that we don't accidentally damage the input.
     414sub tmp_tidy_file
     415{
     416    my $self = shift (@_);
     417    my ($file,$cnfile) = @_;
     418    my $input_filename = $file;
     419    my $tmp_filename = $cnfile;
     420 
     421    # get the input filename
     422    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
     423   
    168424        # create HTML parser to decode the input file
    169425        my $parser = HTML::TokeParser::Simple->new($input_filename);
     
    214470        print PROD $tidyfile;
    215471        close (PROD) || die("Error Closing File: $tmp_filename $!");
    216    
    217     # just for checking copy all other file from the base dir to tmp dir if it is not exists
    218     opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
    219     my @files = grep {!/^\.+$/} readdir(DIR);
    220     close(DIR);
    221 
    222     foreach my $file (@files)
    223     {
    224         my $src_file = &util::filename_cat($base_dirname,$file);
    225         my $dest_file = &util::filename_cat($tmp_dirname,$file);
    226         if ((!-e $dest_file) && (!-d $src_file))
    227         {
    228             # just copy the original file back to the tmp directory
    229                 open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!";
    230                 open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!";
    231                 print TIDYOUT <TIDYIN>;
    232                 close TIDYIN;
    233                 close TIDYOUT;
    234         }
    235     }
    236     }
    237     else
    238     {
    239         if (!-e $tmp_filename)
    240     {
    241             # just copy the original file back to the tmp directory
    242             open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!";
    243             open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!";
    244             print TIDYOUT <TIDYIN>;
    245             close TIDYIN;
    246             close TIDYOUT;
    247     }
    248     }
    249472   
    250473    # return the output filename
     
    257480    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    258481   
     482    # check the process_exp and block_exp thing
     483    my ($block_status,$filename) = $self->read_block(@_);   
     484    return $block_status if ((!defined $block_status) || ($block_status==0));
     485   
    259486    # get the input file
    260487    my $input_filename = $file;
     
    262489    $suffix = lc($suffix);   
    263490   
    264     if ($self->{'tidy_html'})
     491    if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'}))
    265492    {
    266         # tidy the input file if it is a HTML-like file
    267         #if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    268         #{
    269493        # set the file to be tidied
    270494            $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
    271495       
    272496            # get the tidied file
    273             my $tidy_filename = $self->tmp_tidy_file($input_filename);
     497            #my $tidy_filename = $self->tmp_tidy_file($input_filename);
     498        my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
    274499       
    275500            # derive tmp filename from input filename
     
    279504        $file = "$tailname$suffix";
    280505        $base_dir = $dirname;
    281     #}
    282506    }
    283507   
  • trunk/gsdl/perllib/strings.properties

    r13968 r14012  
    819819HTMLPlug.tidy_html:If set, converts a HTML document to a well-formed XHTML. It enable users to view the document in the book format.
    820820
     821HTMLPlug.old_style_HDL:To mark whether the file in this collection is sectionalized using the old HDL's section style.
     822
    821823ImagePlug.converttotype:Convert main image to format 's'.
    822824
Note: See TracChangeset for help on using the changeset viewer.