Changeset 20774


Ignore:
Timestamp:
2009-10-05T15:43:00+13:00 (15 years ago)
Author:
kjdon
Message:

moved some of the horrible old methods to the end of the file so that the important ones come first

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/HTMLPlugin.pm

    r20689 r20774  
    124124
    125125
    126 sub HB_read_html_file {
    127     my $self = shift (@_);
    128     my ($htmlfile, $text) = @_;
    129    
    130     # load in the file
    131     if (!open (FILE, $htmlfile)) {
    132     print STDERR "ERROR - could not open $htmlfile\n";
    133     return;
    134     }
    135 
    136     my $foundbody = 0;
    137     $self->HB_gettext (\$foundbody, $text, "FILE");
    138     close FILE;
    139    
    140     # just in case there was no <body> tag
    141     if (!$foundbody) {
    142     $foundbody = 1;
    143     open (FILE, $htmlfile) || return;
    144     $self->HB_gettext (\$foundbody, $text, "FILE");
    145     close FILE;
    146     }
    147     # text is in utf8
    148 }       
    149 
    150 # converts the text to utf8, as ghtml does that for &eacute; etc.
    151 sub HB_gettext {
    152     my $self = shift (@_);
    153     my ($foundbody, $text, $handle) = @_;
    154    
    155     my $line = "";
    156     while (defined ($line = <$handle>)) {
    157     # look for body tag
    158     if (!$$foundbody) {
    159         if ($line =~ s/^.*<body[^>]*>//i) {
    160         $$foundbody = 1;
    161         } else {
    162         next;
    163         }
    164     }
    165    
    166     # check for symbol fonts
    167     if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
    168         my $font = $1;
    169         print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
    170         if ($font !~ m/^arial$/i);
    171     }
    172 
    173     $$text .= $line;
    174     }
    175 
    176     if ($self->{'input_encoding'} eq "iso_8859_1") {
    177     # convert to utf-8
    178     $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
    179     }
    180     # convert any alphanumeric character entities to their utf-8
    181     # equivalent for indexing purposes
    182     #&ghtml::convertcharentities ($$text);
    183 
    184     $$text =~ s/\s+/ /g; # remove \n's
    185 }
    186 
    187 sub HB_clean_section {
    188     my $self = shift (@_);
    189     my ($section) = @_;
    190 
    191     # remove tags without a starting tag from the section
    192     my ($tag, $tagstart);
    193     while ($section =~ m/<\/([^>]{1,10})>/) {
    194     $tag = $1;
    195     $tagstart = index($section, "<$tag");
    196     last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
    197     $section =~ s/<\/$tag>//;
    198     }
    199    
    200     # remove extra paragraph tags
    201     while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
    202    
    203     # remove extra stuff at the end of the section
    204     while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
    205    
    206     # add a newline at the beginning of each paragraph
    207     $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
    208    
    209     # add a newline every 80 characters at a word boundary
    210     # Note: this regular expression puts a line feed before
    211     # the last word in each section, even when it is not
    212     # needed.
    213     $section =~ s/(.{1,80})\s/$1\n/g;
    214    
    215     # fix up the image links
    216     $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
    217     <center><img src=\"$1\" \/><\/center><br\/>/ig;
    218     $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
    219     <center><img src=\"$1\" \/><\/center><br\/>/ig;
    220 
    221     return $section;
    222 }
    223 
    224 # Will convert the oldHDL format to the new HDL format (using the Section tag) 
    225 sub convert_to_newHDLformat
     126sub new {
     127    my ($class) = shift (@_);
     128    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
     129    push(@$pluginlist, $class);
     130   
     131    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
     132    push(@{$hashArgOptLists->{"OptList"}},$options);
     133   
     134
     135    my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
     136   
     137    if ($self->{'w3mir'}) {
     138    $self->{'file_is_url'} = 1;
     139    }
     140    $self->{'aux_files'} = {};
     141    $self->{'dir_num'} = 0;
     142    $self->{'file_num'} = 0;
     143   
     144    return bless $self, $class;
     145}
     146
     147# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
     148# if have eg <script language="javascript" src="img/lib.js@123">
     149sub get_default_block_exp {
     150    my $self = shift (@_);
     151   
     152    #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
     153    return "";
     154}
     155
     156sub get_default_process_exp {
     157    my $self = shift (@_);
     158   
     159    # the last option is an attempt to encode the concept of an html query ...
     160    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
     161}
     162
     163sub store_block_files
    226164{
    227     my $self = shift (@_);
    228     my ($file,$cnfile) = @_;
    229     my $input_filename = $file;
    230     my $tmp_filename = $cnfile;
    231    
    232     # write HTML tmp file with new HDL format
    233     open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
    234    
    235     # read in the file and do basic html cleaning (removing header etc)
    236     my $html = "";
    237     $self->HB_read_html_file ($input_filename, \$html);
    238    
    239     # process the file one section at a time
    240     my $curtoclevel = 1;
    241     my $firstsection = 1;
    242     my $toclevel = 0;
    243     while (length ($html) > 0) {
    244     if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
    245         $toclevel = $3;
    246         my $title = $4;
    247         my $sectiontext = "";
    248         if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
    249         $sectiontext = $1;
    250         } else {
    251         $sectiontext = $html;
    252         $html = "";
    253         }
    254 
    255         # remove tags and extra spaces from the title
    256         $title =~ s/<\/?[^>]+>//g;
    257         $title =~ s/^\s+|\s+$//g;
    258 
    259         # close any sections below the current level and
    260         # create a new section (special case for the firstsection)
    261         print PROD "<!--\n";
    262         while (($curtoclevel > $toclevel) ||
    263            (!$firstsection && $curtoclevel == $toclevel)) {
    264         $curtoclevel--;
    265         print PROD "</Section>\n";
    266         }
    267         if ($curtoclevel+1 < $toclevel) {
    268         print STDERR "WARNING - jump in toc levels in $input_filename " .
    269             "from $curtoclevel to $toclevel\n";
    270         }
    271         while ($curtoclevel < $toclevel) {
    272         $curtoclevel++;
    273         }
    274 
    275         if ($curtoclevel == 1) {
    276             # add the header tag
    277         print PROD "-->\n";
    278             print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
    279         print PROD "<!--\n";
    280         }
    281        
    282         print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
    283        
    284         print PROD "-->\n";
    285        
    286         # clean up the section html
    287         $sectiontext = $self->HB_clean_section($sectiontext);
    288 
    289         print PROD "$sectiontext\n";     
    290 
    291     } else {
    292         print STDERR "WARNING - leftover text\n" , $self->shorten($html),
    293         "\nin $input_filename\n";
    294         last;
    295     }
    296     $firstsection = 0;
    297     }
    298    
    299     print PROD "<!--\n";
    300     while ($curtoclevel > 0) {
    301     $curtoclevel--;
    302     print PROD "</Section>\n";
    303     }
    304     print PROD "-->\n";
    305    
    306     close (PROD) || die("Error Closing File: $tmp_filename $!");
    307    
    308     return $tmp_filename;
    309 }       
    310 
    311 sub shorten {
    312     my $self = shift (@_);
    313     my ($text) = @_;
    314 
    315     return "\"$text\"" if (length($text) < 100);
    316 
    317     return "\"" . substr ($text, 0, 50) . "\" ... \"" .
    318     substr ($text, length($text)-50) . "\"";
    319 }
    320 
    321 sub convert_tidy_or_oldHDL_file
    322 {
    323     my $self = shift (@_);
    324     my ($file) = @_;
    325     my $input_filename = $file;
    326    
    327     if (-d $input_filename)
    328     {
    329         return $input_filename;
    330     }
    331    
    332     # get the input filename
    333     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    334     my $base_dirname = $dirname;
    335     $suffix = lc($suffix);   
    336    
    337     # derive tmp filename from input filename
    338     # Remove any white space from filename -- no risk of name collision, and
    339     # makes later conversion by utils simpler. Leave spaces in path...
    340     # tidy up the filename with space, dot, hyphen between
    341     $tailname =~ s/\s+//g;
    342     $tailname =~ s/\.+//g;
    343     $tailname =~ s/\-+//g;
    344     # convert to utf-8 otherwise we have problems with the doc.xml file
    345     # later on
    346     &unicode::ensure_utf8(\$tailname);
    347    
    348     # softlink to collection tmp dir
    349     my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    350     &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
    351    
    352     my $test_dirname = "";
    353     my $f_separator = &util::get_os_dirsep();
    354    
    355     if ($dirname =~ m/import$f_separator/)
    356     {
    357         $test_dirname = $'; #'
    358    
    359     #print STDERR "init $'\n";
    360    
    361     while ($test_dirname =~ m/[$f_separator]/)
    362     {
    363         my $folderdirname = $`;
    364         $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
    365         &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
    366         $test_dirname = $'; #'
    367     }
    368     }
    369    
    370     my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
    371    
    372     # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
    373     if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    374     {   
    375         #convert the input file to a new style HDL
    376         my $hdl_output_filename = $input_filename;
    377         if ($self->{'old_style_HDL'})
    378         {
    379         $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
    380         $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
    381         }
    382    
    383     #just for checking copy all other file from the base dir to tmp dir if it is not exists
    384     opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
    385     my @files = grep {!/^\.+$/} readdir(DIR);
    386     close(DIR);
    387 
    388     foreach my $file (@files)
    389     {
    390         my $src_file = &util::filename_cat($base_dirname,$file);
    391         my $dest_file = &util::filename_cat($tmp_dirname,$file);
    392         if ((!-e $dest_file) && (!-d $src_file))
    393         {
    394         # just copy the original file back to the tmp directory
    395         copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
    396         }
    397     }
    398    
    399     # tidy the input file
    400     my $tidy_output_filename = $hdl_output_filename;
    401     if ($self->{'use_realistic_book'})
    402     {
    403         $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
    404         $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
    405     }
    406     $tmp_filename = $tidy_output_filename;
    407     }
    408     else
    409     {
    410         if (!-e $tmp_filename)
    411     {
    412         # just copy the original file back to the tmp directory
    413         copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
    414     }
    415     }
    416    
    417     return $tmp_filename;
    418 }
    419 
    420 
    421 # Will make the html input file as a proper XML file with removed font tag and
    422 # image size added to the img tag.
    423 # The tidying process takes place in a collection specific 'tmp' directory so
    424 # that we don't accidentally damage the input.
    425 sub tmp_tidy_file
    426 {
    427     my $self = shift (@_);
    428     my ($file,$cnfile) = @_;
    429     my $input_filename = $file;
    430     my $tmp_filename = $cnfile;
    431    
    432     # get the input filename
    433     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    434 
    435     require HTML::TokeParser::Simple;
    436    
    437     # create HTML parser to decode the input file
    438     my $parser = HTML::TokeParser::Simple->new($input_filename);
    439 
    440     # write HTML tmp file without the font tag and image size are added to the img tag
    441     open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
    442     while (my $token = $parser->get_token())
    443     {
    444     # is it an img tag
    445     if ($token->is_start_tag('img'))
    446     {
    447         # get the attributes
    448         my $attr = $token->return_attr;
    449 
    450         # get the full path to the image
    451         my $img_file = &util::filename_cat($dirname,$attr->{src});
    452        
    453         # set the width and height attribute
    454         ($attr->{width}, $attr->{height}) = imgsize($img_file);
    455 
    456         # recreate the tag
    457         print PROD "<img";
    458         print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
    459         print PROD ">";
    460     }
    461     # is it a font tag
    462     else
    463     {
    464         if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
    465         {
    466         # remove font tag
    467         print PROD "";
    468         }
    469         else
    470         {
    471         # print without changes
    472         print PROD $token->as_is;
    473         }
    474     }
    475     }
    476     close (PROD) || die("Error Closing File: $tmp_filename $!");
    477 
    478     # run html-tidy on the tmp file to make it a proper XML file
    479     my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
    480 
    481     # write result back to the tmp file
    482     open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
    483     print PROD $tidyfile;
    484     close (PROD) || die("Error Closing File: $tmp_filename $!");
    485    
    486     # return the output filename
    487     return $tmp_filename;
    488 }
     165    my $self =shift (@_);
     166    my ($filename_full_path, $block_hash) = @_;
     167
     168    my $html_fname = $filename_full_path;
     169    my @file_blocks;
     170   
     171    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
     172
     173    # read in file ($text will be in utf8)
     174    my $raw_text = "";
     175    $self->read_file_no_decoding ($filename_full_path, \$raw_text);
     176
     177    my $textref = \$raw_text;
     178    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
     179    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     180    $$textref =~ s/$opencom(.*?)$closecom//gs;
     181
     182    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
     183    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     184    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
     185    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
     186    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     187    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
     188    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     189
     190    if(!defined $self->{'utf8_to_original_filename'}) {
     191    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
     192    $self->{'utf8_to_original_filename'} = {};
     193    }
     194
     195    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
     196
     197    # remove quotes from link at start and end if necessary
     198    if ($link=~/^\"/) {
     199        $link=~s/^\"//;
     200        $link=~s/\"$//;
     201    }
     202
     203    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
     204    # some links may just be anchor names
     205    next unless ($link =~ /\S+/);
     206
     207    if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
     208        # Turn relative file path into full path
     209        my $dirname = &File::Basename::dirname($filename_full_path);
     210        $link = &util::filename_cat($dirname, $link);
     211    }
     212    $link = $self->eval_dir_dots($link);
     213
     214    # this is the actual filename on the filesystem (that the link refers to)
     215    my $url_original_filename = $self->opt_url_decode($link);
     216
     217    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
     218    my $utf8_link = "";
     219    $self->decode_text($link,$encoding,$language,\$utf8_link);
     220
     221    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
     222#   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
     223
     224    if ($url_original_filename ne $utf8_link) {
     225        my $outhandle = $self->{'outhandle'};
     226
     227        print $outhandle "URL Encoding $url_original_filename\n";
     228        print $outhandle " ->$utf8_link\n";
     229    }
     230
     231    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
     232    }
     233}
     234
     235# Given a filename in any encoding, will URL decode it to get back the original filename
     236# in the original encoding. Because this method is intended to work out the *original*
     237# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
     238# string already exists in the local folder.
     239# Return the original filename corresponding to the parameter URL-encoded filename, and
     240# a decoded flag that is set to true iff URL-decoding had to be applied.
     241sub opt_url_decode {
     242    my $self = shift (@_);
     243    my ($link) = @_;
     244
     245    # Replace %XX's in URL with decoded value if required.
     246    # Note that the filename may include the %XX in some situations
     247    if ($link =~ m/\%[A-F0-9]{2}/i) {
     248    if (!-e $link) {
     249        $link = &unicode::url_decode($link);
     250    }
     251    }
     252
     253    return $link;
     254}
    489255
    490256sub read_into_doc_obj
     
    523289    return ($process_status,$doc_obj);
    524290}
    525 
    526 sub new {
    527     my ($class) = shift (@_);
    528     my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    529     push(@$pluginlist, $class);
    530    
    531     push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    532     push(@{$hashArgOptLists->{"OptList"}},$options);
    533    
    534 
    535     my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
    536    
    537     if ($self->{'w3mir'}) {
    538     $self->{'file_is_url'} = 1;
    539     }
    540     $self->{'aux_files'} = {};
    541     $self->{'dir_num'} = 0;
    542     $self->{'file_num'} = 0;
    543    
    544     return bless $self, $class;
    545 }
    546 
    547 # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
    548 # if have eg <script language="javascript" src="img/lib.js@123">
    549 sub get_default_block_exp {
    550     my $self = shift (@_);
    551    
    552     #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
    553     return "";
    554 }
    555 
    556 sub get_default_process_exp {
    557     my $self = shift (@_);
    558    
    559     # the last option is an attempt to encode the concept of an html query ...
    560     return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
    561 }
    562 
    563 sub store_block_files
    564 {
    565     my $self =shift (@_);
    566     my ($filename_full_path, $block_hash) = @_;
    567 
    568     my $html_fname = $filename_full_path;
    569     my @file_blocks;
    570    
    571     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    572 
    573     # read in file ($text will be in utf8)
    574     my $raw_text = "";
    575     $self->read_file_no_decoding ($filename_full_path, \$raw_text);
    576 
    577     my $textref = \$raw_text;
    578     my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    579     my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    580     $$textref =~ s/$opencom(.*?)$closecom//gs;
    581 
    582     my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
    583     my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    584     my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    585     my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    586     my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    587     my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    588     my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    589 
    590     if(!defined $self->{'utf8_to_original_filename'}) {
    591     # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
    592     $self->{'utf8_to_original_filename'} = {};
    593     }
    594 
    595     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
    596 
    597     # remove quotes from link at start and end if necessary
    598     if ($link=~/^\"/) {
    599         $link=~s/^\"//;
    600         $link=~s/\"$//;
    601     }
    602 
    603     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
    604     # some links may just be anchor names
    605     next unless ($link =~ /\S+/);
    606 
    607     if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
    608         # Turn relative file path into full path
    609         my $dirname = &File::Basename::dirname($filename_full_path);
    610         $link = &util::filename_cat($dirname, $link);
    611     }
    612     $link = $self->eval_dir_dots($link);
    613 
    614     # this is the actual filename on the filesystem (that the link refers to)
    615     my $url_original_filename = $self->opt_url_decode($link);
    616 
    617     # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
    618     my $utf8_link = "";
    619     $self->decode_text($link,$encoding,$language,\$utf8_link);
    620 
    621     $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
    622 #   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
    623 
    624     if ($url_original_filename ne $utf8_link) {
    625         my $outhandle = $self->{'outhandle'};
    626 
    627         print $outhandle "URL Encoding $url_original_filename\n";
    628         print $outhandle " ->$utf8_link\n";
    629     }
    630 
    631     $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
    632     }
    633 }
    634 
    635 # Given a filename in any encoding, will URL decode it to get back the original filename
    636 # in the original encoding. Because this method is intended to work out the *original*
    637 # filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
    638 # string already exists in the local folder.
    639 # Return the original filename corresponding to the parameter URL-encoded filename, and
    640 # a decoded flag that is set to true iff URL-decoding had to be applied.
    641 sub opt_url_decode {
    642     my $self = shift (@_);
    643     my ($link) = @_;
    644 
    645     # Replace %XX's in URL with decoded value if required.
    646     # Note that the filename may include the %XX in some situations
    647     if ($link =~ m/\%[A-F0-9]{2}/i) {
    648     if (!-e $link) {
    649         $link = &unicode::url_decode($link);
    650     }
    651     }
    652 
    653     return $link;
    654 }
    655 
    656291
    657292# do plugin specific processing of doc_obj
     
    15471182}
    15481183
     1184sub HB_read_html_file {
     1185    my $self = shift (@_);
     1186    my ($htmlfile, $text) = @_;
     1187   
     1188    # load in the file
     1189    if (!open (FILE, $htmlfile)) {
     1190    print STDERR "ERROR - could not open $htmlfile\n";
     1191    return;
     1192    }
     1193
     1194    my $foundbody = 0;
     1195    $self->HB_gettext (\$foundbody, $text, "FILE");
     1196    close FILE;
     1197   
     1198    # just in case there was no <body> tag
     1199    if (!$foundbody) {
     1200    $foundbody = 1;
     1201    open (FILE, $htmlfile) || return;
     1202    $self->HB_gettext (\$foundbody, $text, "FILE");
     1203    close FILE;
     1204    }
     1205    # text is in utf8
     1206}       
     1207
     1208# converts the text to utf8, as ghtml does that for &eacute; etc.
     1209sub HB_gettext {
     1210    my $self = shift (@_);
     1211    my ($foundbody, $text, $handle) = @_;
     1212   
     1213    my $line = "";
     1214    while (defined ($line = <$handle>)) {
     1215    # look for body tag
     1216    if (!$$foundbody) {
     1217        if ($line =~ s/^.*<body[^>]*>//i) {
     1218        $$foundbody = 1;
     1219        } else {
     1220        next;
     1221        }
     1222    }
     1223   
     1224    # check for symbol fonts
     1225    if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
     1226        my $font = $1;
     1227        print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
     1228        if ($font !~ m/^arial$/i);
     1229    }
     1230
     1231    $$text .= $line;
     1232    }
     1233
     1234    if ($self->{'input_encoding'} eq "iso_8859_1") {
     1235    # convert to utf-8
     1236    $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
     1237    }
     1238    # convert any alphanumeric character entities to their utf-8
     1239    # equivalent for indexing purposes
     1240    #&ghtml::convertcharentities ($$text);
     1241
     1242    $$text =~ s/\s+/ /g; # remove \n's
     1243}
     1244
     1245sub HB_clean_section {
     1246    my $self = shift (@_);
     1247    my ($section) = @_;
     1248
     1249    # remove tags without a starting tag from the section
     1250    my ($tag, $tagstart);
     1251    while ($section =~ m/<\/([^>]{1,10})>/) {
     1252    $tag = $1;
     1253    $tagstart = index($section, "<$tag");
     1254    last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
     1255    $section =~ s/<\/$tag>//;
     1256    }
     1257   
     1258    # remove extra paragraph tags
     1259    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
     1260   
     1261    # remove extra stuff at the end of the section
     1262    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
     1263   
     1264    # add a newline at the beginning of each paragraph
     1265    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
     1266   
     1267    # add a newline every 80 characters at a word boundary
     1268    # Note: this regular expression puts a line feed before
     1269    # the last word in each section, even when it is not
     1270    # needed.
     1271    $section =~ s/(.{1,80})\s/$1\n/g;
     1272   
     1273    # fix up the image links
     1274    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
     1275    <center><img src=\"$1\" \/><\/center><br\/>/ig;
     1276    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
     1277    <center><img src=\"$1\" \/><\/center><br\/>/ig;
     1278
     1279    return $section;
     1280}
     1281
     1282# Will convert the oldHDL format to the new HDL format (using the Section tag) 
     1283sub convert_to_newHDLformat
     1284{
     1285    my $self = shift (@_);
     1286    my ($file,$cnfile) = @_;
     1287    my $input_filename = $file;
     1288    my $tmp_filename = $cnfile;
     1289   
     1290    # write HTML tmp file with new HDL format
     1291    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
     1292   
     1293    # read in the file and do basic html cleaning (removing header etc)
     1294    my $html = "";
     1295    $self->HB_read_html_file ($input_filename, \$html);
     1296   
     1297    # process the file one section at a time
     1298    my $curtoclevel = 1;
     1299    my $firstsection = 1;
     1300    my $toclevel = 0;
     1301    while (length ($html) > 0) {
     1302    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
     1303        $toclevel = $3;
     1304        my $title = $4;
     1305        my $sectiontext = "";
     1306        if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
     1307        $sectiontext = $1;
     1308        } else {
     1309        $sectiontext = $html;
     1310        $html = "";
     1311        }
     1312
     1313        # remove tags and extra spaces from the title
     1314        $title =~ s/<\/?[^>]+>//g;
     1315        $title =~ s/^\s+|\s+$//g;
     1316
     1317        # close any sections below the current level and
     1318        # create a new section (special case for the firstsection)
     1319        print PROD "<!--\n";
     1320        while (($curtoclevel > $toclevel) ||
     1321           (!$firstsection && $curtoclevel == $toclevel)) {
     1322        $curtoclevel--;
     1323        print PROD "</Section>\n";
     1324        }
     1325        if ($curtoclevel+1 < $toclevel) {
     1326        print STDERR "WARNING - jump in toc levels in $input_filename " .
     1327            "from $curtoclevel to $toclevel\n";
     1328        }
     1329        while ($curtoclevel < $toclevel) {
     1330        $curtoclevel++;
     1331        }
     1332
     1333        if ($curtoclevel == 1) {
     1334            # add the header tag
     1335        print PROD "-->\n";
     1336            print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
     1337        print PROD "<!--\n";
     1338        }
     1339       
     1340        print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
     1341       
     1342        print PROD "-->\n";
     1343       
     1344        # clean up the section html
     1345        $sectiontext = $self->HB_clean_section($sectiontext);
     1346
     1347        print PROD "$sectiontext\n";     
     1348
     1349    } else {
     1350        print STDERR "WARNING - leftover text\n" , $self->shorten($html),
     1351        "\nin $input_filename\n";
     1352        last;
     1353    }
     1354    $firstsection = 0;
     1355    }
     1356   
     1357    print PROD "<!--\n";
     1358    while ($curtoclevel > 0) {
     1359    $curtoclevel--;
     1360    print PROD "</Section>\n";
     1361    }
     1362    print PROD "-->\n";
     1363   
     1364    close (PROD) || die("Error Closing File: $tmp_filename $!");
     1365   
     1366    return $tmp_filename;
     1367}       
     1368
     1369sub shorten {
     1370    my $self = shift (@_);
     1371    my ($text) = @_;
     1372
     1373    return "\"$text\"" if (length($text) < 100);
     1374
     1375    return "\"" . substr ($text, 0, 50) . "\" ... \"" .
     1376    substr ($text, length($text)-50) . "\"";
     1377}
     1378
     1379sub convert_tidy_or_oldHDL_file
     1380{
     1381    my $self = shift (@_);
     1382    my ($file) = @_;
     1383    my $input_filename = $file;
     1384   
     1385    if (-d $input_filename)
     1386    {
     1387        return $input_filename;
     1388    }
     1389   
     1390    # get the input filename
     1391    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
     1392    my $base_dirname = $dirname;
     1393    $suffix = lc($suffix);   
     1394   
     1395    # derive tmp filename from input filename
     1396    # Remove any white space from filename -- no risk of name collision, and
     1397    # makes later conversion by utils simpler. Leave spaces in path...
     1398    # tidy up the filename with space, dot, hyphen between
     1399    $tailname =~ s/\s+//g;
     1400    $tailname =~ s/\.+//g;
     1401    $tailname =~ s/\-+//g;
     1402    # convert to utf-8 otherwise we have problems with the doc.xml file
     1403    # later on
     1404    &unicode::ensure_utf8(\$tailname);
     1405   
     1406    # softlink to collection tmp dir
     1407    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
     1408    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
     1409   
     1410    my $test_dirname = "";
     1411    my $f_separator = &util::get_os_dirsep();
     1412   
     1413    if ($dirname =~ m/import$f_separator/)
     1414    {
     1415        $test_dirname = $'; #'
     1416   
     1417    #print STDERR "init $'\n";
     1418   
     1419    while ($test_dirname =~ m/[$f_separator]/)
     1420    {
     1421        my $folderdirname = $`;
     1422        $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
     1423        &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
     1424        $test_dirname = $'; #'
     1425    }
     1426    }
     1427   
     1428    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     1429   
     1430    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
     1431    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
     1432    {   
     1433        #convert the input file to a new style HDL
     1434        my $hdl_output_filename = $input_filename;
     1435        if ($self->{'old_style_HDL'})
     1436        {
     1437        $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     1438        $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
     1439        }
     1440   
     1441    #just for checking copy all other file from the base dir to tmp dir if it is not exists
     1442    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
     1443    my @files = grep {!/^\.+$/} readdir(DIR);
     1444    close(DIR);
     1445
     1446    foreach my $file (@files)
     1447    {
     1448        my $src_file = &util::filename_cat($base_dirname,$file);
     1449        my $dest_file = &util::filename_cat($tmp_dirname,$file);
     1450        if ((!-e $dest_file) && (!-d $src_file))
     1451        {
     1452        # just copy the original file back to the tmp directory
     1453        copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
     1454        }
     1455    }
     1456   
     1457    # tidy the input file
     1458    my $tidy_output_filename = $hdl_output_filename;
     1459    if ($self->{'use_realistic_book'})
     1460    {
     1461        $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     1462        $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
     1463    }
     1464    $tmp_filename = $tidy_output_filename;
     1465    }
     1466    else
     1467    {
     1468        if (!-e $tmp_filename)
     1469    {
     1470        # just copy the original file back to the tmp directory
     1471        copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
     1472    }
     1473    }
     1474   
     1475    return $tmp_filename;
     1476}
     1477
     1478
     1479# Will make the html input file as a proper XML file with removed font tag and
     1480# image size added to the img tag.
     1481# The tidying process takes place in a collection specific 'tmp' directory so
     1482# that we don't accidentally damage the input.
     1483sub tmp_tidy_file
     1484{
     1485    my $self = shift (@_);
     1486    my ($file,$cnfile) = @_;
     1487    my $input_filename = $file;
     1488    my $tmp_filename = $cnfile;
     1489   
     1490    # get the input filename
     1491    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
     1492
     1493    require HTML::TokeParser::Simple;
     1494   
     1495    # create HTML parser to decode the input file
     1496    my $parser = HTML::TokeParser::Simple->new($input_filename);
     1497
     1498    # write HTML tmp file without the font tag and image size are added to the img tag
     1499    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
     1500    while (my $token = $parser->get_token())
     1501    {
     1502    # is it an img tag
     1503    if ($token->is_start_tag('img'))
     1504    {
     1505        # get the attributes
     1506        my $attr = $token->return_attr;
     1507
     1508        # get the full path to the image
     1509        my $img_file = &util::filename_cat($dirname,$attr->{src});
     1510       
     1511        # set the width and height attribute
     1512        ($attr->{width}, $attr->{height}) = imgsize($img_file);
     1513
     1514        # recreate the tag
     1515        print PROD "<img";
     1516        print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
     1517        print PROD ">";
     1518    }
     1519    # is it a font tag
     1520    else
     1521    {
     1522        if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
     1523        {
     1524        # remove font tag
     1525        print PROD "";
     1526        }
     1527        else
     1528        {
     1529        # print without changes
     1530        print PROD $token->as_is;
     1531        }
     1532    }
     1533    }
     1534    close (PROD) || die("Error Closing File: $tmp_filename $!");
     1535
     1536    # run html-tidy on the tmp file to make it a proper XML file
     1537    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
     1538
     1539    # write result back to the tmp file
     1540    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
     1541    print PROD $tidyfile;
     1542    close (PROD) || die("Error Closing File: $tmp_filename $!");
     1543   
     1544    # return the output filename
     1545    return $tmp_filename;
     1546}
     1547
    154915481;
Note: See TracChangeset for help on using the changeset viewer.