Changeset 20774 for gsdl/trunk/perllib/plugins/HTMLPlugin.pm
- Timestamp:
- 2009-10-05T15:43:00+13:00 (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/HTMLPlugin.pm
r20689 r20774 124 124 125 125 126 sub HB_read_html_file { 127 my $self = shift (@_); 128 my ($htmlfile, $text) = @_; 129 130 # load in the file 131 if (!open (FILE, $htmlfile)) { 132 print STDERR "ERROR - could not open $htmlfile\n"; 133 return; 134 } 135 136 my $foundbody = 0; 137 $self->HB_gettext (\$foundbody, $text, "FILE"); 138 close FILE; 139 140 # just in case there was no <body> tag 141 if (!$foundbody) { 142 $foundbody = 1; 143 open (FILE, $htmlfile) || return; 144 $self->HB_gettext (\$foundbody, $text, "FILE"); 145 close FILE; 146 } 147 # text is in utf8 148 } 149 150 # converts the text to utf8, as ghtml does that for é etc. 151 sub HB_gettext { 152 my $self = shift (@_); 153 my ($foundbody, $text, $handle) = @_; 154 155 my $line = ""; 156 while (defined ($line = <$handle>)) { 157 # look for body tag 158 if (!$$foundbody) { 159 if ($line =~ s/^.*<body[^>]*>//i) { 160 $$foundbody = 1; 161 } else { 162 next; 163 } 164 } 165 166 # check for symbol fonts 167 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 168 my $font = $1; 169 print STDERR "HBPlug::HB_gettext - warning removed font $font\n" 170 if ($font !~ m/^arial$/i); 171 } 172 173 $$text .= $line; 174 } 175 176 if ($self->{'input_encoding'} eq "iso_8859_1") { 177 # convert to utf-8 178 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); 179 } 180 # convert any alphanumeric character entities to their utf-8 181 # equivalent for indexing purposes 182 #&ghtml::convertcharentities ($$text); 183 184 $$text =~ s/\s+/ /g; # remove \n's 185 } 186 187 sub HB_clean_section { 188 my $self = shift (@_); 189 my ($section) = @_; 190 191 # remove tags without a starting tag from the section 192 my ($tag, $tagstart); 193 while ($section =~ m/<\/([^>]{1,10})>/) { 194 $tag = $1; 195 $tagstart = index($section, "<$tag"); 196 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); 197 $section =~ s/<\/$tag>//; 198 } 199 200 # remove extra paragraph tags 201 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {} 202 203 # remove extra stuff at the end of the section 204 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>| |\s)$//i) {} 205 206 # add a newline at the beginning of each paragraph 207 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi; 208 209 # add a newline every 80 characters at a word boundary 210 # Note: this regular expression puts a line feed before 211 # the last word in each section, even when it is not 212 # needed. 213 $section =~ s/(.{1,80})\s/$1\n/g; 214 215 # fix up the image links 216 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/ 217 <center><img src=\"$1\" \/><\/center><br\/>/ig; 218 $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/ 219 <center><img src=\"$1\" \/><\/center><br\/>/ig; 220 221 return $section; 222 } 223 224 # Will convert the oldHDL format to the new HDL format (using the Section tag) 225 sub convert_to_newHDLformat 126 sub new { 127 my ($class) = shift (@_); 128 my ($pluginlist,$inputargs,$hashArgOptLists) = @_; 129 push(@$pluginlist, $class); 130 131 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 132 push(@{$hashArgOptLists->{"OptList"}},$options); 133 134 135 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists); 136 137 if ($self->{'w3mir'}) { 138 $self->{'file_is_url'} = 1; 139 } 140 $self->{'aux_files'} = {}; 141 $self->{'dir_num'} = 0; 142 $self->{'file_num'} = 0; 143 144 return bless $self, $class; 145 } 146 147 # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$ 148 # if have eg <script language="javascript" src="img/lib.js@123"> 149 sub get_default_block_exp { 150 my $self = shift (@_); 151 152 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^; 153 return ""; 154 } 155 156 sub get_default_process_exp { 157 my $self = shift (@_); 158 159 # the last option is an attempt to encode the concept of an html query ... 160 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^; 161 } 162 163 sub store_block_files 226 164 { 227 my $self = shift (@_); 228 my ($file,$cnfile) = @_; 229 my $input_filename = $file; 230 my $tmp_filename = $cnfile; 231 232 # write HTML tmp file with new HDL format 233 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 234 235 # read in the file and do basic html cleaning (removing header etc) 236 my $html = ""; 237 $self->HB_read_html_file ($input_filename, \$html); 238 239 # process the file one section at a time 240 my $curtoclevel = 1; 241 my $firstsection = 1; 242 my $toclevel = 0; 243 while (length ($html) > 0) { 244 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC(\d+)>>\s*(.*?)<p\b/<p/i) { 245 $toclevel = $3; 246 my $title = $4; 247 my $sectiontext = ""; 248 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC\d+>>)/$2/i) { 249 $sectiontext = $1; 250 } else { 251 $sectiontext = $html; 252 $html = ""; 253 } 254 255 # remove tags and extra spaces from the title 256 $title =~ s/<\/?[^>]+>//g; 257 $title =~ s/^\s+|\s+$//g; 258 259 # close any sections below the current level and 260 # create a new section (special case for the firstsection) 261 print PROD "<!--\n"; 262 while (($curtoclevel > $toclevel) || 263 (!$firstsection && $curtoclevel == $toclevel)) { 264 $curtoclevel--; 265 print PROD "</Section>\n"; 266 } 267 if ($curtoclevel+1 < $toclevel) { 268 print STDERR "WARNING - jump in toc levels in $input_filename " . 269 "from $curtoclevel to $toclevel\n"; 270 } 271 while ($curtoclevel < $toclevel) { 272 $curtoclevel++; 273 } 274 275 if ($curtoclevel == 1) { 276 # add the header tag 277 print PROD "-->\n"; 278 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n"; 279 print PROD "<!--\n"; 280 } 281 282 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n"; 283 284 print PROD "-->\n"; 285 286 # clean up the section html 287 $sectiontext = $self->HB_clean_section($sectiontext); 288 289 print PROD "$sectiontext\n"; 290 291 } else { 292 print STDERR "WARNING - leftover text\n" , $self->shorten($html), 293 "\nin $input_filename\n"; 294 last; 295 } 296 $firstsection = 0; 297 } 298 299 print PROD "<!--\n"; 300 while ($curtoclevel > 0) { 301 $curtoclevel--; 302 print PROD "</Section>\n"; 303 } 304 print PROD "-->\n"; 305 306 close (PROD) || die("Error Closing File: $tmp_filename $!"); 307 308 return $tmp_filename; 309 } 310 311 sub shorten { 312 my $self = shift (@_); 313 my ($text) = @_; 314 315 return "\"$text\"" if (length($text) < 100); 316 317 return "\"" . substr ($text, 0, 50) . "\" ... \"" . 318 substr ($text, length($text)-50) . "\""; 319 } 320 321 sub convert_tidy_or_oldHDL_file 322 { 323 my $self = shift (@_); 324 my ($file) = @_; 325 my $input_filename = $file; 326 327 if (-d $input_filename) 328 { 329 return $input_filename; 330 } 331 332 # get the input filename 333 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 334 my $base_dirname = $dirname; 335 $suffix = lc($suffix); 336 337 # derive tmp filename from input filename 338 # Remove any white space from filename -- no risk of name collision, and 339 # makes later conversion by utils simpler. Leave spaces in path... 340 # tidy up the filename with space, dot, hyphen between 341 $tailname =~ s/\s+//g; 342 $tailname =~ s/\.+//g; 343 $tailname =~ s/\-+//g; 344 # convert to utf-8 otherwise we have problems with the doc.xml file 345 # later on 346 &unicode::ensure_utf8(\$tailname); 347 348 # softlink to collection tmp dir 349 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); 350 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 351 352 my $test_dirname = ""; 353 my $f_separator = &util::get_os_dirsep(); 354 355 if ($dirname =~ m/import$f_separator/) 356 { 357 $test_dirname = $'; #' 358 359 #print STDERR "init $'\n"; 360 361 while ($test_dirname =~ m/[$f_separator]/) 362 { 363 my $folderdirname = $`; 364 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); 365 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 366 $test_dirname = $'; #' 367 } 368 } 369 370 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 371 372 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp 373 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 374 { 375 #convert the input file to a new style HDL 376 my $hdl_output_filename = $input_filename; 377 if ($self->{'old_style_HDL'}) 378 { 379 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 380 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); 381 } 382 383 #just for checking copy all other file from the base dir to tmp dir if it is not exists 384 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; 385 my @files = grep {!/^\.+$/} readdir(DIR); 386 close(DIR); 387 388 foreach my $file (@files) 389 { 390 my $src_file = &util::filename_cat($base_dirname,$file); 391 my $dest_file = &util::filename_cat($tmp_dirname,$file); 392 if ((!-e $dest_file) && (!-d $src_file)) 393 { 394 # just copy the original file back to the tmp directory 395 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!"; 396 } 397 } 398 399 # tidy the input file 400 my $tidy_output_filename = $hdl_output_filename; 401 if ($self->{'use_realistic_book'}) 402 { 403 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 404 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); 405 } 406 $tmp_filename = $tidy_output_filename; 407 } 408 else 409 { 410 if (!-e $tmp_filename) 411 { 412 # just copy the original file back to the tmp directory 413 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!"; 414 } 415 } 416 417 return $tmp_filename; 418 } 419 420 421 # Will make the html input file as a proper XML file with removed font tag and 422 # image size added to the img tag. 423 # The tidying process takes place in a collection specific 'tmp' directory so 424 # that we don't accidentally damage the input. 425 sub tmp_tidy_file 426 { 427 my $self = shift (@_); 428 my ($file,$cnfile) = @_; 429 my $input_filename = $file; 430 my $tmp_filename = $cnfile; 431 432 # get the input filename 433 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 434 435 require HTML::TokeParser::Simple; 436 437 # create HTML parser to decode the input file 438 my $parser = HTML::TokeParser::Simple->new($input_filename); 439 440 # write HTML tmp file without the font tag and image size are added to the img tag 441 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 442 while (my $token = $parser->get_token()) 443 { 444 # is it an img tag 445 if ($token->is_start_tag('img')) 446 { 447 # get the attributes 448 my $attr = $token->return_attr; 449 450 # get the full path to the image 451 my $img_file = &util::filename_cat($dirname,$attr->{src}); 452 453 # set the width and height attribute 454 ($attr->{width}, $attr->{height}) = imgsize($img_file); 455 456 # recreate the tag 457 print PROD "<img"; 458 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr; 459 print PROD ">"; 460 } 461 # is it a font tag 462 else 463 { 464 if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) 465 { 466 # remove font tag 467 print PROD ""; 468 } 469 else 470 { 471 # print without changes 472 print PROD $token->as_is; 473 } 474 } 475 } 476 close (PROD) || die("Error Closing File: $tmp_filename $!"); 477 478 # run html-tidy on the tmp file to make it a proper XML file 479 my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`; 480 481 # write result back to the tmp file 482 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 483 print PROD $tidyfile; 484 close (PROD) || die("Error Closing File: $tmp_filename $!"); 485 486 # return the output filename 487 return $tmp_filename; 488 } 165 my $self =shift (@_); 166 my ($filename_full_path, $block_hash) = @_; 167 168 my $html_fname = $filename_full_path; 169 my @file_blocks; 170 171 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path); 172 173 # read in file ($text will be in utf8) 174 my $raw_text = ""; 175 $self->read_file_no_decoding ($filename_full_path, \$raw_text); 176 177 my $textref = \$raw_text; 178 my $opencom = '(?:<!--|<!(?:—|—|--))'; 179 my $closecom = '(?:-->|(?:—|—|--)>)'; 180 $$textref =~ s/$opencom(.*?)$closecom//gs; 181 182 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; 183 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs); 184 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs); 185 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs); 186 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs); 187 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs); 188 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 189 190 if(!defined $self->{'utf8_to_original_filename'}) { 191 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 192 $self->{'utf8_to_original_filename'} = {}; 193 } 194 195 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 196 197 # remove quotes from link at start and end if necessary 198 if ($link=~/^\"/) { 199 $link=~s/^\"//; 200 $link=~s/\"$//; 201 } 202 203 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 204 # some links may just be anchor names 205 next unless ($link =~ /\S+/); 206 207 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) { 208 # Turn relative file path into full path 209 my $dirname = &File::Basename::dirname($filename_full_path); 210 $link = &util::filename_cat($dirname, $link); 211 } 212 $link = $self->eval_dir_dots($link); 213 214 # this is the actual filename on the filesystem (that the link refers to) 215 my $url_original_filename = $self->opt_url_decode($link); 216 217 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 218 my $utf8_link = ""; 219 $self->decode_text($link,$encoding,$language,\$utf8_link); 220 221 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 222 # print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 223 224 if ($url_original_filename ne $utf8_link) { 225 my $outhandle = $self->{'outhandle'}; 226 227 print $outhandle "URL Encoding $url_original_filename\n"; 228 print $outhandle " ->$utf8_link\n"; 229 } 230 231 $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 232 } 233 } 234 235 # Given a filename in any encoding, will URL decode it to get back the original filename 236 # in the original encoding. Because this method is intended to work out the *original* 237 # filename*, it does not URL decode any filename if a file by the name of the *URL-encoded* 238 # string already exists in the local folder. 239 # Return the original filename corresponding to the parameter URL-encoded filename, and 240 # a decoded flag that is set to true iff URL-decoding had to be applied. 241 sub opt_url_decode { 242 my $self = shift (@_); 243 my ($link) = @_; 244 245 # Replace %XX's in URL with decoded value if required. 246 # Note that the filename may include the %XX in some situations 247 if ($link =~ m/\%[A-F0-9]{2}/i) { 248 if (!-e $link) { 249 $link = &unicode::url_decode($link); 250 } 251 } 252 253 return $link; 254 } 489 255 490 256 sub read_into_doc_obj … … 523 289 return ($process_status,$doc_obj); 524 290 } 525 526 sub new {527 my ($class) = shift (@_);528 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;529 push(@$pluginlist, $class);530 531 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});532 push(@{$hashArgOptLists->{"OptList"}},$options);533 534 535 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);536 537 if ($self->{'w3mir'}) {538 $self->{'file_is_url'} = 1;539 }540 $self->{'aux_files'} = {};541 $self->{'dir_num'} = 0;542 $self->{'file_num'} = 0;543 544 return bless $self, $class;545 }546 547 # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$548 # if have eg <script language="javascript" src="img/lib.js@123">549 sub get_default_block_exp {550 my $self = shift (@_);551 552 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;553 return "";554 }555 556 sub get_default_process_exp {557 my $self = shift (@_);558 559 # the last option is an attempt to encode the concept of an html query ...560 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;561 }562 563 sub store_block_files564 {565 my $self =shift (@_);566 my ($filename_full_path, $block_hash) = @_;567 568 my $html_fname = $filename_full_path;569 my @file_blocks;570 571 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);572 573 # read in file ($text will be in utf8)574 my $raw_text = "";575 $self->read_file_no_decoding ($filename_full_path, \$raw_text);576 577 my $textref = \$raw_text;578 my $opencom = '(?:<!--|<!(?:—|—|--))';579 my $closecom = '(?:-->|(?:—|—|--)>)';580 $$textref =~ s/$opencom(.*?)$closecom//gs;581 582 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";583 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);584 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);585 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);586 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);587 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);588 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);589 590 if(!defined $self->{'utf8_to_original_filename'}) {591 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url592 $self->{'utf8_to_original_filename'} = {};593 }594 595 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {596 597 # remove quotes from link at start and end if necessary598 if ($link=~/^\"/) {599 $link=~s/^\"//;600 $link=~s/\"$//;601 }602 603 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html604 # some links may just be anchor names605 next unless ($link =~ /\S+/);606 607 if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {608 # Turn relative file path into full path609 my $dirname = &File::Basename::dirname($filename_full_path);610 $link = &util::filename_cat($dirname, $link);611 }612 $link = $self->eval_dir_dots($link);613 614 # this is the actual filename on the filesystem (that the link refers to)615 my $url_original_filename = $self->opt_url_decode($link);616 617 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename618 my $utf8_link = "";619 $self->decode_text($link,$encoding,$language,\$utf8_link);620 621 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;622 # print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";623 624 if ($url_original_filename ne $utf8_link) {625 my $outhandle = $self->{'outhandle'};626 627 print $outhandle "URL Encoding $url_original_filename\n";628 print $outhandle " ->$utf8_link\n";629 }630 631 $block_hash->{'file_blocks'}->{$url_original_filename} = 1;632 }633 }634 635 # Given a filename in any encoding, will URL decode it to get back the original filename636 # in the original encoding. Because this method is intended to work out the *original*637 # filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*638 # string already exists in the local folder.639 # Return the original filename corresponding to the parameter URL-encoded filename, and640 # a decoded flag that is set to true iff URL-decoding had to be applied.641 sub opt_url_decode {642 my $self = shift (@_);643 my ($link) = @_;644 645 # Replace %XX's in URL with decoded value if required.646 # Note that the filename may include the %XX in some situations647 if ($link =~ m/\%[A-F0-9]{2}/i) {648 if (!-e $link) {649 $link = &unicode::url_decode($link);650 }651 }652 653 return $link;654 }655 656 291 657 292 # do plugin specific processing of doc_obj … … 1547 1182 } 1548 1183 1184 sub HB_read_html_file { 1185 my $self = shift (@_); 1186 my ($htmlfile, $text) = @_; 1187 1188 # load in the file 1189 if (!open (FILE, $htmlfile)) { 1190 print STDERR "ERROR - could not open $htmlfile\n"; 1191 return; 1192 } 1193 1194 my $foundbody = 0; 1195 $self->HB_gettext (\$foundbody, $text, "FILE"); 1196 close FILE; 1197 1198 # just in case there was no <body> tag 1199 if (!$foundbody) { 1200 $foundbody = 1; 1201 open (FILE, $htmlfile) || return; 1202 $self->HB_gettext (\$foundbody, $text, "FILE"); 1203 close FILE; 1204 } 1205 # text is in utf8 1206 } 1207 1208 # converts the text to utf8, as ghtml does that for é etc. 1209 sub HB_gettext { 1210 my $self = shift (@_); 1211 my ($foundbody, $text, $handle) = @_; 1212 1213 my $line = ""; 1214 while (defined ($line = <$handle>)) { 1215 # look for body tag 1216 if (!$$foundbody) { 1217 if ($line =~ s/^.*<body[^>]*>//i) { 1218 $$foundbody = 1; 1219 } else { 1220 next; 1221 } 1222 } 1223 1224 # check for symbol fonts 1225 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 1226 my $font = $1; 1227 print STDERR "HBPlug::HB_gettext - warning removed font $font\n" 1228 if ($font !~ m/^arial$/i); 1229 } 1230 1231 $$text .= $line; 1232 } 1233 1234 if ($self->{'input_encoding'} eq "iso_8859_1") { 1235 # convert to utf-8 1236 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); 1237 } 1238 # convert any alphanumeric character entities to their utf-8 1239 # equivalent for indexing purposes 1240 #&ghtml::convertcharentities ($$text); 1241 1242 $$text =~ s/\s+/ /g; # remove \n's 1243 } 1244 1245 sub HB_clean_section { 1246 my $self = shift (@_); 1247 my ($section) = @_; 1248 1249 # remove tags without a starting tag from the section 1250 my ($tag, $tagstart); 1251 while ($section =~ m/<\/([^>]{1,10})>/) { 1252 $tag = $1; 1253 $tagstart = index($section, "<$tag"); 1254 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); 1255 $section =~ s/<\/$tag>//; 1256 } 1257 1258 # remove extra paragraph tags 1259 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {} 1260 1261 # remove extra stuff at the end of the section 1262 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>| |\s)$//i) {} 1263 1264 # add a newline at the beginning of each paragraph 1265 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi; 1266 1267 # add a newline every 80 characters at a word boundary 1268 # Note: this regular expression puts a line feed before 1269 # the last word in each section, even when it is not 1270 # needed. 1271 $section =~ s/(.{1,80})\s/$1\n/g; 1272 1273 # fix up the image links 1274 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/ 1275 <center><img src=\"$1\" \/><\/center><br\/>/ig; 1276 $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/ 1277 <center><img src=\"$1\" \/><\/center><br\/>/ig; 1278 1279 return $section; 1280 } 1281 1282 # Will convert the oldHDL format to the new HDL format (using the Section tag) 1283 sub convert_to_newHDLformat 1284 { 1285 my $self = shift (@_); 1286 my ($file,$cnfile) = @_; 1287 my $input_filename = $file; 1288 my $tmp_filename = $cnfile; 1289 1290 # write HTML tmp file with new HDL format 1291 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 1292 1293 # read in the file and do basic html cleaning (removing header etc) 1294 my $html = ""; 1295 $self->HB_read_html_file ($input_filename, \$html); 1296 1297 # process the file one section at a time 1298 my $curtoclevel = 1; 1299 my $firstsection = 1; 1300 my $toclevel = 0; 1301 while (length ($html) > 0) { 1302 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC(\d+)>>\s*(.*?)<p\b/<p/i) { 1303 $toclevel = $3; 1304 my $title = $4; 1305 my $sectiontext = ""; 1306 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC\d+>>)/$2/i) { 1307 $sectiontext = $1; 1308 } else { 1309 $sectiontext = $html; 1310 $html = ""; 1311 } 1312 1313 # remove tags and extra spaces from the title 1314 $title =~ s/<\/?[^>]+>//g; 1315 $title =~ s/^\s+|\s+$//g; 1316 1317 # close any sections below the current level and 1318 # create a new section (special case for the firstsection) 1319 print PROD "<!--\n"; 1320 while (($curtoclevel > $toclevel) || 1321 (!$firstsection && $curtoclevel == $toclevel)) { 1322 $curtoclevel--; 1323 print PROD "</Section>\n"; 1324 } 1325 if ($curtoclevel+1 < $toclevel) { 1326 print STDERR "WARNING - jump in toc levels in $input_filename " . 1327 "from $curtoclevel to $toclevel\n"; 1328 } 1329 while ($curtoclevel < $toclevel) { 1330 $curtoclevel++; 1331 } 1332 1333 if ($curtoclevel == 1) { 1334 # add the header tag 1335 print PROD "-->\n"; 1336 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n"; 1337 print PROD "<!--\n"; 1338 } 1339 1340 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n"; 1341 1342 print PROD "-->\n"; 1343 1344 # clean up the section html 1345 $sectiontext = $self->HB_clean_section($sectiontext); 1346 1347 print PROD "$sectiontext\n"; 1348 1349 } else { 1350 print STDERR "WARNING - leftover text\n" , $self->shorten($html), 1351 "\nin $input_filename\n"; 1352 last; 1353 } 1354 $firstsection = 0; 1355 } 1356 1357 print PROD "<!--\n"; 1358 while ($curtoclevel > 0) { 1359 $curtoclevel--; 1360 print PROD "</Section>\n"; 1361 } 1362 print PROD "-->\n"; 1363 1364 close (PROD) || die("Error Closing File: $tmp_filename $!"); 1365 1366 return $tmp_filename; 1367 } 1368 1369 sub shorten { 1370 my $self = shift (@_); 1371 my ($text) = @_; 1372 1373 return "\"$text\"" if (length($text) < 100); 1374 1375 return "\"" . substr ($text, 0, 50) . "\" ... \"" . 1376 substr ($text, length($text)-50) . "\""; 1377 } 1378 1379 sub convert_tidy_or_oldHDL_file 1380 { 1381 my $self = shift (@_); 1382 my ($file) = @_; 1383 my $input_filename = $file; 1384 1385 if (-d $input_filename) 1386 { 1387 return $input_filename; 1388 } 1389 1390 # get the input filename 1391 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 1392 my $base_dirname = $dirname; 1393 $suffix = lc($suffix); 1394 1395 # derive tmp filename from input filename 1396 # Remove any white space from filename -- no risk of name collision, and 1397 # makes later conversion by utils simpler. Leave spaces in path... 1398 # tidy up the filename with space, dot, hyphen between 1399 $tailname =~ s/\s+//g; 1400 $tailname =~ s/\.+//g; 1401 $tailname =~ s/\-+//g; 1402 # convert to utf-8 otherwise we have problems with the doc.xml file 1403 # later on 1404 &unicode::ensure_utf8(\$tailname); 1405 1406 # softlink to collection tmp dir 1407 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); 1408 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 1409 1410 my $test_dirname = ""; 1411 my $f_separator = &util::get_os_dirsep(); 1412 1413 if ($dirname =~ m/import$f_separator/) 1414 { 1415 $test_dirname = $'; #' 1416 1417 #print STDERR "init $'\n"; 1418 1419 while ($test_dirname =~ m/[$f_separator]/) 1420 { 1421 my $folderdirname = $`; 1422 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); 1423 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 1424 $test_dirname = $'; #' 1425 } 1426 } 1427 1428 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 1429 1430 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp 1431 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 1432 { 1433 #convert the input file to a new style HDL 1434 my $hdl_output_filename = $input_filename; 1435 if ($self->{'old_style_HDL'}) 1436 { 1437 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 1438 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); 1439 } 1440 1441 #just for checking copy all other file from the base dir to tmp dir if it is not exists 1442 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; 1443 my @files = grep {!/^\.+$/} readdir(DIR); 1444 close(DIR); 1445 1446 foreach my $file (@files) 1447 { 1448 my $src_file = &util::filename_cat($base_dirname,$file); 1449 my $dest_file = &util::filename_cat($tmp_dirname,$file); 1450 if ((!-e $dest_file) && (!-d $src_file)) 1451 { 1452 # just copy the original file back to the tmp directory 1453 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!"; 1454 } 1455 } 1456 1457 # tidy the input file 1458 my $tidy_output_filename = $hdl_output_filename; 1459 if ($self->{'use_realistic_book'}) 1460 { 1461 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 1462 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); 1463 } 1464 $tmp_filename = $tidy_output_filename; 1465 } 1466 else 1467 { 1468 if (!-e $tmp_filename) 1469 { 1470 # just copy the original file back to the tmp directory 1471 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!"; 1472 } 1473 } 1474 1475 return $tmp_filename; 1476 } 1477 1478 1479 # Will make the html input file as a proper XML file with removed font tag and 1480 # image size added to the img tag. 1481 # The tidying process takes place in a collection specific 'tmp' directory so 1482 # that we don't accidentally damage the input. 1483 sub tmp_tidy_file 1484 { 1485 my $self = shift (@_); 1486 my ($file,$cnfile) = @_; 1487 my $input_filename = $file; 1488 my $tmp_filename = $cnfile; 1489 1490 # get the input filename 1491 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 1492 1493 require HTML::TokeParser::Simple; 1494 1495 # create HTML parser to decode the input file 1496 my $parser = HTML::TokeParser::Simple->new($input_filename); 1497 1498 # write HTML tmp file without the font tag and image size are added to the img tag 1499 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 1500 while (my $token = $parser->get_token()) 1501 { 1502 # is it an img tag 1503 if ($token->is_start_tag('img')) 1504 { 1505 # get the attributes 1506 my $attr = $token->return_attr; 1507 1508 # get the full path to the image 1509 my $img_file = &util::filename_cat($dirname,$attr->{src}); 1510 1511 # set the width and height attribute 1512 ($attr->{width}, $attr->{height}) = imgsize($img_file); 1513 1514 # recreate the tag 1515 print PROD "<img"; 1516 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr; 1517 print PROD ">"; 1518 } 1519 # is it a font tag 1520 else 1521 { 1522 if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) 1523 { 1524 # remove font tag 1525 print PROD ""; 1526 } 1527 else 1528 { 1529 # print without changes 1530 print PROD $token->as_is; 1531 } 1532 } 1533 } 1534 close (PROD) || die("Error Closing File: $tmp_filename $!"); 1535 1536 # run html-tidy on the tmp file to make it a proper XML file 1537 my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`; 1538 1539 # write result back to the tmp file 1540 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 1541 print PROD $tidyfile; 1542 close (PROD) || die("Error Closing File: $tmp_filename $!"); 1543 1544 # return the output filename 1545 return $tmp_filename; 1546 } 1547 1549 1548 1;
Note:
See TracChangeset
for help on using the changeset viewer.