Changeset 20774

Show
Ignore:
Timestamp:
05.10.2009 15:43:00 (10 years ago)
Author:
kjdon
Message:

moved some of the horrible old methods to the end of the file so that the important ones come first

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/HTMLPlugin.pm

    r20689 r20774  
    124124 
    125125 
    126 sub HB_read_html_file { 
    127     my $self = shift (@_); 
    128     my ($htmlfile, $text) = @_; 
    129      
    130     # load in the file 
    131     if (!open (FILE, $htmlfile)) { 
    132     print STDERR "ERROR - could not open $htmlfile\n"; 
    133     return; 
    134     } 
    135  
    136     my $foundbody = 0; 
    137     $self->HB_gettext (\$foundbody, $text, "FILE"); 
    138     close FILE; 
    139      
    140     # just in case there was no <body> tag 
    141     if (!$foundbody) { 
    142     $foundbody = 1; 
    143     open (FILE, $htmlfile) || return; 
    144     $self->HB_gettext (\$foundbody, $text, "FILE");  
    145     close FILE; 
    146     } 
    147     # text is in utf8 
    148 }        
    149  
    150 # converts the text to utf8, as ghtml does that for &eacute; etc. 
    151 sub HB_gettext { 
    152     my $self = shift (@_); 
    153     my ($foundbody, $text, $handle) = @_; 
    154      
    155     my $line = ""; 
    156     while (defined ($line = <$handle>)) { 
    157     # look for body tag 
    158     if (!$$foundbody) { 
    159         if ($line =~ s/^.*<body[^>]*>//i) { 
    160         $$foundbody = 1; 
    161         } else { 
    162         next; 
    163         } 
    164     } 
    165      
    166     # check for symbol fonts 
    167     if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 
    168         my $font = $1; 
    169         print STDERR "HBPlug::HB_gettext - warning removed font $font\n"  
    170         if ($font !~ m/^arial$/i); 
    171     } 
    172  
    173     $$text .= $line; 
    174     } 
    175  
    176     if ($self->{'input_encoding'} eq "iso_8859_1") { 
    177     # convert to utf-8 
    178     $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); 
    179     } 
    180     # convert any alphanumeric character entities to their utf-8 
    181     # equivalent for indexing purposes 
    182     #&ghtml::convertcharentities ($$text); 
    183  
    184     $$text =~ s/\s+/ /g; # remove \n's 
    185 } 
    186  
    187 sub HB_clean_section { 
    188     my $self = shift (@_); 
    189     my ($section) = @_; 
    190  
    191     # remove tags without a starting tag from the section 
    192     my ($tag, $tagstart); 
    193     while ($section =~ m/<\/([^>]{1,10})>/) { 
    194     $tag = $1; 
    195     $tagstart = index($section, "<$tag"); 
    196     last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); 
    197     $section =~ s/<\/$tag>//; 
    198     } 
    199      
    200     # remove extra paragraph tags 
    201     while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {} 
    202      
    203     # remove extra stuff at the end of the section 
    204     while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {} 
    205      
    206     # add a newline at the beginning of each paragraph 
    207     $section =~ s/(.)\s*<p\b/$1\n\n<p/gi; 
    208      
    209     # add a newline every 80 characters at a word boundary 
    210     # Note: this regular expression puts a line feed before 
    211     # the last word in each section, even when it is not 
    212     # needed. 
    213     $section =~ s/(.{1,80})\s/$1\n/g; 
    214      
    215     # fix up the image links 
    216     $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/ 
    217     <center><img src=\"$1\" \/><\/center><br\/>/ig; 
    218     $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/ 
    219     <center><img src=\"$1\" \/><\/center><br\/>/ig; 
    220  
    221     return $section; 
    222 } 
    223  
    224 # Will convert the oldHDL format to the new HDL format (using the Section tag)   
    225 sub convert_to_newHDLformat 
     126sub new { 
     127    my ($class) = shift (@_); 
     128    my ($pluginlist,$inputargs,$hashArgOptLists) = @_; 
     129    push(@$pluginlist, $class); 
     130     
     131    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 
     132    push(@{$hashArgOptLists->{"OptList"}},$options); 
     133     
     134 
     135    my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists); 
     136     
     137    if ($self->{'w3mir'}) { 
     138    $self->{'file_is_url'} = 1; 
     139    } 
     140    $self->{'aux_files'} = {}; 
     141    $self->{'dir_num'} = 0; 
     142    $self->{'file_num'} = 0; 
     143     
     144    return bless $self, $class; 
     145} 
     146 
     147# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$ 
     148# if have eg <script language="javascript" src="img/lib.js@123"> 
     149sub get_default_block_exp { 
     150    my $self = shift (@_); 
     151     
     152    #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^; 
     153    return ""; 
     154} 
     155 
     156sub get_default_process_exp { 
     157    my $self = shift (@_); 
     158     
     159    # the last option is an attempt to encode the concept of an html query ... 
     160    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^; 
     161} 
     162 
     163sub store_block_files 
    226164{ 
    227     my $self = shift (@_); 
    228     my ($file,$cnfile) = @_; 
    229     my $input_filename = $file; 
    230     my $tmp_filename = $cnfile; 
    231      
    232     # write HTML tmp file with new HDL format 
    233     open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 
    234      
    235     # read in the file and do basic html cleaning (removing header etc) 
    236     my $html = ""; 
    237     $self->HB_read_html_file ($input_filename, \$html); 
    238      
    239     # process the file one section at a time 
    240     my $curtoclevel = 1; 
    241     my $firstsection = 1; 
    242     my $toclevel = 0; 
    243     while (length ($html) > 0) { 
    244     if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) { 
    245         $toclevel = $3; 
    246         my $title = $4; 
    247         my $sectiontext = ""; 
    248         if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) { 
    249         $sectiontext = $1; 
    250         } else { 
    251         $sectiontext = $html; 
    252         $html = ""; 
    253         } 
    254  
    255         # remove tags and extra spaces from the title 
    256         $title =~ s/<\/?[^>]+>//g; 
    257         $title =~ s/^\s+|\s+$//g; 
    258  
    259         # close any sections below the current level and 
    260         # create a new section (special case for the firstsection) 
    261         print PROD "<!--\n"; 
    262         while (($curtoclevel > $toclevel) || 
    263            (!$firstsection && $curtoclevel == $toclevel)) { 
    264         $curtoclevel--; 
    265         print PROD "</Section>\n"; 
    266         } 
    267         if ($curtoclevel+1 < $toclevel) { 
    268         print STDERR "WARNING - jump in toc levels in $input_filename " .  
    269             "from $curtoclevel to $toclevel\n"; 
    270         } 
    271         while ($curtoclevel < $toclevel) { 
    272         $curtoclevel++; 
    273         } 
    274  
    275         if ($curtoclevel == 1) { 
    276             # add the header tag 
    277         print PROD "-->\n"; 
    278             print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n"; 
    279         print PROD "<!--\n"; 
    280         } 
    281          
    282         print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n"; 
    283          
    284         print PROD "-->\n"; 
    285          
    286         # clean up the section html 
    287         $sectiontext = $self->HB_clean_section($sectiontext); 
    288  
    289         print PROD "$sectiontext\n";      
    290  
    291     } else { 
    292         print STDERR "WARNING - leftover text\n" , $self->shorten($html),  
    293         "\nin $input_filename\n"; 
    294         last; 
    295     } 
    296     $firstsection = 0; 
    297     } 
    298      
    299     print PROD "<!--\n"; 
    300     while ($curtoclevel > 0) { 
    301     $curtoclevel--; 
    302     print PROD "</Section>\n"; 
    303     } 
    304     print PROD "-->\n"; 
    305      
    306     close (PROD) || die("Error Closing File: $tmp_filename $!"); 
    307      
    308     return $tmp_filename; 
    309 }        
    310  
    311 sub shorten { 
    312     my $self = shift (@_); 
    313     my ($text) = @_; 
    314  
    315     return "\"$text\"" if (length($text) < 100); 
    316  
    317     return "\"" . substr ($text, 0, 50) . "\" ... \"" .  
    318     substr ($text, length($text)-50) . "\""; 
    319 } 
    320  
    321 sub convert_tidy_or_oldHDL_file 
    322 { 
    323     my $self = shift (@_); 
    324     my ($file) = @_; 
    325     my $input_filename = $file; 
    326      
    327     if (-d $input_filename) 
    328     { 
    329         return $input_filename; 
    330     } 
    331      
    332     # get the input filename 
    333     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
    334     my $base_dirname = $dirname; 
    335     $suffix = lc($suffix);    
    336      
    337     # derive tmp filename from input filename 
    338     # Remove any white space from filename -- no risk of name collision, and 
    339     # makes later conversion by utils simpler. Leave spaces in path... 
    340     # tidy up the filename with space, dot, hyphen between 
    341     $tailname =~ s/\s+//g;  
    342     $tailname =~ s/\.+//g; 
    343     $tailname =~ s/\-+//g; 
    344     # convert to utf-8 otherwise we have problems with the doc.xml file 
    345     # later on 
    346     &unicode::ensure_utf8(\$tailname); 
    347      
    348     # softlink to collection tmp dir 
    349     my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); 
    350     &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 
    351      
    352     my $test_dirname = ""; 
    353     my $f_separator = &util::get_os_dirsep(); 
    354      
    355     if ($dirname =~ m/import$f_separator/) 
    356     { 
    357         $test_dirname = $'; #' 
    358      
    359     #print STDERR "init $'\n"; 
    360      
    361     while ($test_dirname =~ m/[$f_separator]/) 
    362     { 
    363         my $folderdirname = $`; 
    364         $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); 
    365         &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 
    366         $test_dirname = $'; #' 
    367     } 
    368     } 
    369      
    370     my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
    371      
    372     # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp 
    373     if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 
    374     {    
    375         #convert the input file to a new style HDL 
    376         my $hdl_output_filename = $input_filename; 
    377         if ($self->{'old_style_HDL'}) 
    378         { 
    379         $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
    380         $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); 
    381         } 
    382      
    383     #just for checking copy all other file from the base dir to tmp dir if it is not exists 
    384     opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; 
    385     my @files = grep {!/^\.+$/} readdir(DIR); 
    386     close(DIR); 
    387  
    388     foreach my $file (@files)  
    389     { 
    390         my $src_file = &util::filename_cat($base_dirname,$file); 
    391         my $dest_file = &util::filename_cat($tmp_dirname,$file); 
    392         if ((!-e $dest_file) && (!-d $src_file)) 
    393         { 
    394         # just copy the original file back to the tmp directory 
    395         copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!"; 
    396         } 
    397     } 
    398      
    399     # tidy the input file 
    400     my $tidy_output_filename = $hdl_output_filename; 
    401     if ($self->{'use_realistic_book'}) 
    402     { 
    403         $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
    404         $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); 
    405     } 
    406     $tmp_filename = $tidy_output_filename; 
    407     } 
    408     else 
    409     { 
    410         if (!-e $tmp_filename) 
    411     { 
    412         # just copy the original file back to the tmp directory 
    413         copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!"; 
    414     } 
    415     } 
    416      
    417     return $tmp_filename; 
    418 } 
    419  
    420  
    421 # Will make the html input file as a proper XML file with removed font tag and 
    422 # image size added to the img tag. 
    423 # The tidying process takes place in a collection specific 'tmp' directory so  
    424 # that we don't accidentally damage the input. 
    425 sub tmp_tidy_file  
    426 { 
    427     my $self = shift (@_); 
    428     my ($file,$cnfile) = @_; 
    429     my $input_filename = $file; 
    430     my $tmp_filename = $cnfile; 
    431      
    432     # get the input filename 
    433     my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
    434  
    435     require HTML::TokeParser::Simple; 
    436      
    437     # create HTML parser to decode the input file 
    438     my $parser = HTML::TokeParser::Simple->new($input_filename); 
    439  
    440     # write HTML tmp file without the font tag and image size are added to the img tag 
    441     open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 
    442     while (my $token = $parser->get_token()) 
    443     { 
    444     # is it an img tag 
    445     if ($token->is_start_tag('img')) 
    446     { 
    447         # get the attributes 
    448         my $attr = $token->return_attr; 
    449  
    450         # get the full path to the image 
    451         my $img_file = &util::filename_cat($dirname,$attr->{src}); 
    452          
    453         # set the width and height attribute 
    454         ($attr->{width}, $attr->{height}) = imgsize($img_file); 
    455  
    456         # recreate the tag 
    457         print PROD "<img"; 
    458         print PROD map { qq { $_="$attr->{$_}"} } keys %$attr; 
    459         print PROD ">"; 
    460     } 
    461     # is it a font tag 
    462     else  
    463     { 
    464         if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) 
    465         { 
    466         # remove font tag 
    467         print PROD ""; 
    468         } 
    469         else 
    470         { 
    471         # print without changes 
    472         print PROD $token->as_is; 
    473         } 
    474     } 
    475     } 
    476     close (PROD) || die("Error Closing File: $tmp_filename $!"); 
    477  
    478     # run html-tidy on the tmp file to make it a proper XML file 
    479     my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`; 
    480  
    481     # write result back to the tmp file 
    482     open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 
    483     print PROD $tidyfile; 
    484     close (PROD) || die("Error Closing File: $tmp_filename $!"); 
    485      
    486     # return the output filename 
    487     return $tmp_filename; 
    488 }  
     165    my $self =shift (@_); 
     166    my ($filename_full_path, $block_hash) = @_; 
     167 
     168    my $html_fname = $filename_full_path; 
     169    my @file_blocks; 
     170     
     171    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
     172 
     173    # read in file ($text will be in utf8) 
     174    my $raw_text = ""; 
     175    $self->read_file_no_decoding ($filename_full_path, \$raw_text); 
     176 
     177    my $textref = \$raw_text; 
     178    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))'; 
     179    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)'; 
     180    $$textref =~ s/$opencom(.*?)$closecom//gs; 
     181 
     182    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; 
     183    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
     184    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs); 
     185    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs); 
     186    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
     187    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs); 
     188    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
     189 
     190    if(!defined $self->{'utf8_to_original_filename'}) {  
     191    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 
     192    $self->{'utf8_to_original_filename'} = {}; 
     193    } 
     194 
     195    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 
     196 
     197    # remove quotes from link at start and end if necessary 
     198    if ($link=~/^\"/) { 
     199        $link=~s/^\"//; 
     200        $link=~s/\"$//; 
     201    } 
     202 
     203    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 
     204    # some links may just be anchor names 
     205    next unless ($link =~ /\S+/); 
     206 
     207    if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) { 
     208        # Turn relative file path into full path 
     209        my $dirname = &File::Basename::dirname($filename_full_path); 
     210        $link = &util::filename_cat($dirname, $link); 
     211    } 
     212    $link = $self->eval_dir_dots($link); 
     213 
     214    # this is the actual filename on the filesystem (that the link refers to) 
     215    my $url_original_filename = $self->opt_url_decode($link); 
     216 
     217    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 
     218    my $utf8_link = ""; 
     219    $self->decode_text($link,$encoding,$language,\$utf8_link); 
     220 
     221    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 
     222#   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 
     223 
     224    if ($url_original_filename ne $utf8_link) { 
     225        my $outhandle = $self->{'outhandle'}; 
     226 
     227        print $outhandle "URL Encoding $url_original_filename\n"; 
     228        print $outhandle " ->$utf8_link\n"; 
     229    } 
     230 
     231    $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 
     232    } 
     233} 
     234 
     235# Given a filename in any encoding, will URL decode it to get back the original filename 
     236# in the original encoding. Because this method is intended to work out the *original*  
     237# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded* 
     238# string already exists in the local folder. 
     239# Return the original filename corresponding to the parameter URL-encoded filename, and 
     240# a decoded flag that is set to true iff URL-decoding had to be applied. 
     241sub opt_url_decode { 
     242    my $self = shift (@_); 
     243    my ($link) = @_; 
     244 
     245    # Replace %XX's in URL with decoded value if required. 
     246    # Note that the filename may include the %XX in some situations 
     247    if ($link =~ m/\%[A-F0-9]{2}/i) { 
     248    if (!-e $link) { 
     249        $link = &unicode::url_decode($link); 
     250    } 
     251    }  
     252 
     253    return $link; 
     254} 
    489255 
    490256sub read_into_doc_obj  
     
    523289    return ($process_status,$doc_obj); 
    524290} 
    525  
    526 sub new { 
    527     my ($class) = shift (@_); 
    528     my ($pluginlist,$inputargs,$hashArgOptLists) = @_; 
    529     push(@$pluginlist, $class); 
    530      
    531     push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 
    532     push(@{$hashArgOptLists->{"OptList"}},$options); 
    533      
    534  
    535     my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists); 
    536      
    537     if ($self->{'w3mir'}) { 
    538     $self->{'file_is_url'} = 1; 
    539     } 
    540     $self->{'aux_files'} = {}; 
    541     $self->{'dir_num'} = 0; 
    542     $self->{'file_num'} = 0; 
    543      
    544     return bless $self, $class; 
    545 } 
    546  
    547 # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$ 
    548 # if have eg <script language="javascript" src="img/lib.js@123"> 
    549 sub get_default_block_exp { 
    550     my $self = shift (@_); 
    551      
    552     #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^; 
    553     return ""; 
    554 } 
    555  
    556 sub get_default_process_exp { 
    557     my $self = shift (@_); 
    558      
    559     # the last option is an attempt to encode the concept of an html query ... 
    560     return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^; 
    561 } 
    562  
    563 sub store_block_files 
    564 { 
    565     my $self =shift (@_); 
    566     my ($filename_full_path, $block_hash) = @_; 
    567  
    568     my $html_fname = $filename_full_path; 
    569     my @file_blocks; 
    570      
    571     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
    572  
    573     # read in file ($text will be in utf8) 
    574     my $raw_text = ""; 
    575     $self->read_file_no_decoding ($filename_full_path, \$raw_text); 
    576  
    577     my $textref = \$raw_text; 
    578     my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))'; 
    579     my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)'; 
    580     $$textref =~ s/$opencom(.*?)$closecom//gs; 
    581  
    582     my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; 
    583     my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
    584     my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs); 
    585     my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs); 
    586     my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
    587     my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs); 
    588     my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
    589  
    590     if(!defined $self->{'utf8_to_original_filename'}) {  
    591     # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 
    592     $self->{'utf8_to_original_filename'} = {}; 
    593     } 
    594  
    595     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 
    596  
    597     # remove quotes from link at start and end if necessary 
    598     if ($link=~/^\"/) { 
    599         $link=~s/^\"//; 
    600         $link=~s/\"$//; 
    601     } 
    602  
    603     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 
    604     # some links may just be anchor names 
    605     next unless ($link =~ /\S+/); 
    606  
    607     if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) { 
    608         # Turn relative file path into full path 
    609         my $dirname = &File::Basename::dirname($filename_full_path); 
    610         $link = &util::filename_cat($dirname, $link); 
    611     } 
    612     $link = $self->eval_dir_dots($link); 
    613  
    614     # this is the actual filename on the filesystem (that the link refers to) 
    615     my $url_original_filename = $self->opt_url_decode($link); 
    616  
    617     # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 
    618     my $utf8_link = ""; 
    619     $self->decode_text($link,$encoding,$language,\$utf8_link); 
    620  
    621     $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 
    622 #   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 
    623  
    624     if ($url_original_filename ne $utf8_link) { 
    625         my $outhandle = $self->{'outhandle'}; 
    626  
    627         print $outhandle "URL Encoding $url_original_filename\n"; 
    628         print $outhandle " ->$utf8_link\n"; 
    629     } 
    630  
    631     $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 
    632     } 
    633 } 
    634  
    635 # Given a filename in any encoding, will URL decode it to get back the original filename 
    636 # in the original encoding. Because this method is intended to work out the *original*  
    637 # filename*, it does not URL decode any filename if a file by the name of the *URL-encoded* 
    638 # string already exists in the local folder. 
    639 # Return the original filename corresponding to the parameter URL-encoded filename, and 
    640 # a decoded flag that is set to true iff URL-decoding had to be applied. 
    641 sub opt_url_decode { 
    642     my $self = shift (@_); 
    643     my ($link) = @_; 
    644  
    645     # Replace %XX's in URL with decoded value if required. 
    646     # Note that the filename may include the %XX in some situations 
    647     if ($link =~ m/\%[A-F0-9]{2}/i) { 
    648     if (!-e $link) { 
    649         $link = &unicode::url_decode($link); 
    650     } 
    651     }  
    652  
    653     return $link; 
    654 } 
    655  
    656291 
    657292# do plugin specific processing of doc_obj 
     
    15471182} 
    15481183 
     1184sub HB_read_html_file { 
     1185    my $self = shift (@_); 
     1186    my ($htmlfile, $text) = @_; 
     1187     
     1188    # load in the file 
     1189    if (!open (FILE, $htmlfile)) { 
     1190    print STDERR "ERROR - could not open $htmlfile\n"; 
     1191    return; 
     1192    } 
     1193 
     1194    my $foundbody = 0; 
     1195    $self->HB_gettext (\$foundbody, $text, "FILE"); 
     1196    close FILE; 
     1197     
     1198    # just in case there was no <body> tag 
     1199    if (!$foundbody) { 
     1200    $foundbody = 1; 
     1201    open (FILE, $htmlfile) || return; 
     1202    $self->HB_gettext (\$foundbody, $text, "FILE");  
     1203    close FILE; 
     1204    } 
     1205    # text is in utf8 
     1206}        
     1207 
     1208# converts the text to utf8, as ghtml does that for &eacute; etc. 
     1209sub HB_gettext { 
     1210    my $self = shift (@_); 
     1211    my ($foundbody, $text, $handle) = @_; 
     1212     
     1213    my $line = ""; 
     1214    while (defined ($line = <$handle>)) { 
     1215    # look for body tag 
     1216    if (!$$foundbody) { 
     1217        if ($line =~ s/^.*<body[^>]*>//i) { 
     1218        $$foundbody = 1; 
     1219        } else { 
     1220        next; 
     1221        } 
     1222    } 
     1223     
     1224    # check for symbol fonts 
     1225    if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 
     1226        my $font = $1; 
     1227        print STDERR "HBPlug::HB_gettext - warning removed font $font\n"  
     1228        if ($font !~ m/^arial$/i); 
     1229    } 
     1230 
     1231    $$text .= $line; 
     1232    } 
     1233 
     1234    if ($self->{'input_encoding'} eq "iso_8859_1") { 
     1235    # convert to utf-8 
     1236    $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); 
     1237    } 
     1238    # convert any alphanumeric character entities to their utf-8 
     1239    # equivalent for indexing purposes 
     1240    #&ghtml::convertcharentities ($$text); 
     1241 
     1242    $$text =~ s/\s+/ /g; # remove \n's 
     1243} 
     1244 
     1245sub HB_clean_section { 
     1246    my $self = shift (@_); 
     1247    my ($section) = @_; 
     1248 
     1249    # remove tags without a starting tag from the section 
     1250    my ($tag, $tagstart); 
     1251    while ($section =~ m/<\/([^>]{1,10})>/) { 
     1252    $tag = $1; 
     1253    $tagstart = index($section, "<$tag"); 
     1254    last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); 
     1255    $section =~ s/<\/$tag>//; 
     1256    } 
     1257     
     1258    # remove extra paragraph tags 
     1259    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {} 
     1260     
     1261    # remove extra stuff at the end of the section 
     1262    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {} 
     1263     
     1264    # add a newline at the beginning of each paragraph 
     1265    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi; 
     1266     
     1267    # add a newline every 80 characters at a word boundary 
     1268    # Note: this regular expression puts a line feed before 
     1269    # the last word in each section, even when it is not 
     1270    # needed. 
     1271    $section =~ s/(.{1,80})\s/$1\n/g; 
     1272     
     1273    # fix up the image links 
     1274    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/ 
     1275    <center><img src=\"$1\" \/><\/center><br\/>/ig; 
     1276    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/ 
     1277    <center><img src=\"$1\" \/><\/center><br\/>/ig; 
     1278 
     1279    return $section; 
     1280} 
     1281 
     1282# Will convert the oldHDL format to the new HDL format (using the Section tag)   
     1283sub convert_to_newHDLformat 
     1284{ 
     1285    my $self = shift (@_); 
     1286    my ($file,$cnfile) = @_; 
     1287    my $input_filename = $file; 
     1288    my $tmp_filename = $cnfile; 
     1289     
     1290    # write HTML tmp file with new HDL format 
     1291    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 
     1292     
     1293    # read in the file and do basic html cleaning (removing header etc) 
     1294    my $html = ""; 
     1295    $self->HB_read_html_file ($input_filename, \$html); 
     1296     
     1297    # process the file one section at a time 
     1298    my $curtoclevel = 1; 
     1299    my $firstsection = 1; 
     1300    my $toclevel = 0; 
     1301    while (length ($html) > 0) { 
     1302    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) { 
     1303        $toclevel = $3; 
     1304        my $title = $4; 
     1305        my $sectiontext = ""; 
     1306        if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) { 
     1307        $sectiontext = $1; 
     1308        } else { 
     1309        $sectiontext = $html; 
     1310        $html = ""; 
     1311        } 
     1312 
     1313        # remove tags and extra spaces from the title 
     1314        $title =~ s/<\/?[^>]+>//g; 
     1315        $title =~ s/^\s+|\s+$//g; 
     1316 
     1317        # close any sections below the current level and 
     1318        # create a new section (special case for the firstsection) 
     1319        print PROD "<!--\n"; 
     1320        while (($curtoclevel > $toclevel) || 
     1321           (!$firstsection && $curtoclevel == $toclevel)) { 
     1322        $curtoclevel--; 
     1323        print PROD "</Section>\n"; 
     1324        } 
     1325        if ($curtoclevel+1 < $toclevel) { 
     1326        print STDERR "WARNING - jump in toc levels in $input_filename " .  
     1327            "from $curtoclevel to $toclevel\n"; 
     1328        } 
     1329        while ($curtoclevel < $toclevel) { 
     1330        $curtoclevel++; 
     1331        } 
     1332 
     1333        if ($curtoclevel == 1) { 
     1334            # add the header tag 
     1335        print PROD "-->\n"; 
     1336            print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n"; 
     1337        print PROD "<!--\n"; 
     1338        } 
     1339         
     1340        print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n"; 
     1341         
     1342        print PROD "-->\n"; 
     1343         
     1344        # clean up the section html 
     1345        $sectiontext = $self->HB_clean_section($sectiontext); 
     1346 
     1347        print PROD "$sectiontext\n";      
     1348 
     1349    } else { 
     1350        print STDERR "WARNING - leftover text\n" , $self->shorten($html),  
     1351        "\nin $input_filename\n"; 
     1352        last; 
     1353    } 
     1354    $firstsection = 0; 
     1355    } 
     1356     
     1357    print PROD "<!--\n"; 
     1358    while ($curtoclevel > 0) { 
     1359    $curtoclevel--; 
     1360    print PROD "</Section>\n"; 
     1361    } 
     1362    print PROD "-->\n"; 
     1363     
     1364    close (PROD) || die("Error Closing File: $tmp_filename $!"); 
     1365     
     1366    return $tmp_filename; 
     1367}        
     1368 
     1369sub shorten { 
     1370    my $self = shift (@_); 
     1371    my ($text) = @_; 
     1372 
     1373    return "\"$text\"" if (length($text) < 100); 
     1374 
     1375    return "\"" . substr ($text, 0, 50) . "\" ... \"" .  
     1376    substr ($text, length($text)-50) . "\""; 
     1377} 
     1378 
     1379sub convert_tidy_or_oldHDL_file 
     1380{ 
     1381    my $self = shift (@_); 
     1382    my ($file) = @_; 
     1383    my $input_filename = $file; 
     1384     
     1385    if (-d $input_filename) 
     1386    { 
     1387        return $input_filename; 
     1388    } 
     1389     
     1390    # get the input filename 
     1391    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
     1392    my $base_dirname = $dirname; 
     1393    $suffix = lc($suffix);    
     1394     
     1395    # derive tmp filename from input filename 
     1396    # Remove any white space from filename -- no risk of name collision, and 
     1397    # makes later conversion by utils simpler. Leave spaces in path... 
     1398    # tidy up the filename with space, dot, hyphen between 
     1399    $tailname =~ s/\s+//g;  
     1400    $tailname =~ s/\.+//g; 
     1401    $tailname =~ s/\-+//g; 
     1402    # convert to utf-8 otherwise we have problems with the doc.xml file 
     1403    # later on 
     1404    &unicode::ensure_utf8(\$tailname); 
     1405     
     1406    # softlink to collection tmp dir 
     1407    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); 
     1408    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 
     1409     
     1410    my $test_dirname = ""; 
     1411    my $f_separator = &util::get_os_dirsep(); 
     1412     
     1413    if ($dirname =~ m/import$f_separator/) 
     1414    { 
     1415        $test_dirname = $'; #' 
     1416     
     1417    #print STDERR "init $'\n"; 
     1418     
     1419    while ($test_dirname =~ m/[$f_separator]/) 
     1420    { 
     1421        my $folderdirname = $`; 
     1422        $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); 
     1423        &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 
     1424        $test_dirname = $'; #' 
     1425    } 
     1426    } 
     1427     
     1428    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
     1429     
     1430    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp 
     1431    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 
     1432    {    
     1433        #convert the input file to a new style HDL 
     1434        my $hdl_output_filename = $input_filename; 
     1435        if ($self->{'old_style_HDL'}) 
     1436        { 
     1437        $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
     1438        $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); 
     1439        } 
     1440     
     1441    #just for checking copy all other file from the base dir to tmp dir if it is not exists 
     1442    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; 
     1443    my @files = grep {!/^\.+$/} readdir(DIR); 
     1444    close(DIR); 
     1445 
     1446    foreach my $file (@files)  
     1447    { 
     1448        my $src_file = &util::filename_cat($base_dirname,$file); 
     1449        my $dest_file = &util::filename_cat($tmp_dirname,$file); 
     1450        if ((!-e $dest_file) && (!-d $src_file)) 
     1451        { 
     1452        # just copy the original file back to the tmp directory 
     1453        copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!"; 
     1454        } 
     1455    } 
     1456     
     1457    # tidy the input file 
     1458    my $tidy_output_filename = $hdl_output_filename; 
     1459    if ($self->{'use_realistic_book'}) 
     1460    { 
     1461        $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
     1462        $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); 
     1463    } 
     1464    $tmp_filename = $tidy_output_filename; 
     1465    } 
     1466    else 
     1467    { 
     1468        if (!-e $tmp_filename) 
     1469    { 
     1470        # just copy the original file back to the tmp directory 
     1471        copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!"; 
     1472    } 
     1473    } 
     1474     
     1475    return $tmp_filename; 
     1476} 
     1477 
     1478 
     1479# Will make the html input file as a proper XML file with removed font tag and 
     1480# image size added to the img tag. 
     1481# The tidying process takes place in a collection specific 'tmp' directory so  
     1482# that we don't accidentally damage the input. 
     1483sub tmp_tidy_file  
     1484{ 
     1485    my $self = shift (@_); 
     1486    my ($file,$cnfile) = @_; 
     1487    my $input_filename = $file; 
     1488    my $tmp_filename = $cnfile; 
     1489     
     1490    # get the input filename 
     1491    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
     1492 
     1493    require HTML::TokeParser::Simple; 
     1494     
     1495    # create HTML parser to decode the input file 
     1496    my $parser = HTML::TokeParser::Simple->new($input_filename); 
     1497 
     1498    # write HTML tmp file without the font tag and image size are added to the img tag 
     1499    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 
     1500    while (my $token = $parser->get_token()) 
     1501    { 
     1502    # is it an img tag 
     1503    if ($token->is_start_tag('img')) 
     1504    { 
     1505        # get the attributes 
     1506        my $attr = $token->return_attr; 
     1507 
     1508        # get the full path to the image 
     1509        my $img_file = &util::filename_cat($dirname,$attr->{src}); 
     1510         
     1511        # set the width and height attribute 
     1512        ($attr->{width}, $attr->{height}) = imgsize($img_file); 
     1513 
     1514        # recreate the tag 
     1515        print PROD "<img"; 
     1516        print PROD map { qq { $_="$attr->{$_}"} } keys %$attr; 
     1517        print PROD ">"; 
     1518    } 
     1519    # is it a font tag 
     1520    else  
     1521    { 
     1522        if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) 
     1523        { 
     1524        # remove font tag 
     1525        print PROD ""; 
     1526        } 
     1527        else 
     1528        { 
     1529        # print without changes 
     1530        print PROD $token->as_is; 
     1531        } 
     1532    } 
     1533    } 
     1534    close (PROD) || die("Error Closing File: $tmp_filename $!"); 
     1535 
     1536    # run html-tidy on the tmp file to make it a proper XML file 
     1537    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`; 
     1538 
     1539    # write result back to the tmp file 
     1540    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 
     1541    print PROD $tidyfile; 
     1542    close (PROD) || die("Error Closing File: $tmp_filename $!"); 
     1543     
     1544    # return the output filename 
     1545    return $tmp_filename; 
     1546}  
     1547 
    154915481;