Changeset 14012
- Timestamp:
- 2007-04-16T15:43:20+12:00 (17 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r13968 r14012 111 111 'desc' => "{HTMLPlug.tidy_html}", 112 112 'type' => "flag"}, 113 { 'name' => "old_style_HDL", 114 'desc' => "{HTMLPlug.old_style_HDL}", 115 'type' => "flag"} 113 116 ]; 114 117 … … 119 122 'args' => $arguments }; 120 123 121 # Will make the html input file as a proper XML file with removed font tag and 122 # image size added to the img tag. 123 # The tidying process takes place in a collection specific 'tmp' directory so 124 # that we don't accidentally damage the input. 125 sub tmp_tidy_file 124 125 sub HB_read_html_file { 126 my $self = shift (@_); 127 my ($htmlfile, $text) = @_; 128 129 # load in the file 130 if (!open (FILE, $htmlfile)) { 131 print STDERR "ERROR - could not open $htmlfile\n"; 132 return; 133 } 134 135 my $foundbody = 0; 136 $self->HB_gettext (\$foundbody, $text, "FILE"); 137 close FILE; 138 139 # just in case there was no <body> tag 140 if (!$foundbody) { 141 $foundbody = 1; 142 open (FILE, $htmlfile) || return; 143 $self->HB_gettext (\$foundbody, $text, "FILE"); 144 close FILE; 145 } 146 # text is in utf8 147 } 148 149 # converts the text to utf8, as ghtml does that for é etc. 150 sub HB_gettext { 151 my $self = shift (@_); 152 my ($foundbody, $text, $handle) = @_; 153 154 my $line = ""; 155 while (defined ($line = <$handle>)) { 156 # look for body tag 157 if (!$$foundbody) { 158 if ($line =~ s/^.*<body[^>]*>//i) { 159 $$foundbody = 1; 160 } else { 161 next; 162 } 163 } 164 165 # check for symbol fonts 166 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 167 my $font = $1; 168 print STDERR "HBPlug::HB_gettext - warning removed font $font\n" 169 if ($font !~ /^arial$/i); 170 } 171 172 $$text .= $line; 173 } 174 175 if ($self->{'input_encoding'} eq "iso_8859_1") { 176 # convert to utf-8 177 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); 178 } 179 # convert any alphanumeric character entities to their utf-8 180 # equivalent for indexing purposes 181 &ghtml::convertcharentities ($$text); 182 183 $$text =~ s/\s+/ /g; # remove \n's 184 } 185 186 sub HB_clean_section { 187 my $self = shift (@_); 188 my ($section) = @_; 189 190 # remove tags without a starting tag from the section 191 my ($tag, $tagstart); 192 while ($section =~ /<\/([^>]{1,10})>/) { 193 $tag = $1; 194 $tagstart = index($section, "<$tag"); 195 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag"))); 196 $section =~ s/<\/$tag>//; 197 } 198 199 # remove extra paragraph tags 200 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {} 201 202 # remove extra stuff at the end of the section 203 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>| |\s)$//i) {} 204 205 # add a newline at the beginning of each paragraph 206 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi; 207 208 # add a newline every 80 characters at a word boundary 209 # Note: this regular expression puts a line feed before 210 # the last word in each section, even when it is not 211 # needed. 212 $section =~ s/(.{1,80})\s/$1\n/g; 213 214 # fix up the image links 215 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/ 216 <center><img src=\"$1\"><\/center><br>/ig; 217 $section =~ s/<<I>>\s*([^\.]+\.(png|jpg|gif))/ 218 <center><img src=\"$1\"><\/center><br>/ig; 219 220 return $section; 221 } 222 223 # Will convert the oldHDL format to the new HDL format (using the Section tag) 224 sub convert_to_newHDLformat 225 { 226 my $self = shift (@_); 227 my ($file,$cnfile) = @_; 228 my $input_filename = $file; 229 my $tmp_filename = $cnfile; 230 231 # write HTML tmp file with new HDL format 232 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 233 234 # read in the file and do basic html cleaning (removing header etc) 235 my $html = ""; 236 $self->HB_read_html_file ($input_filename, \$html); 237 238 # process the file one section at a time 239 my $curtoclevel = 1; 240 my $firstsection = 1; 241 my $toclevel = 0; 242 while (length ($html) > 0) { 243 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC(\d+)>>\s*(.*?)<p\b/<p/i) { 244 $toclevel = $3; 245 my $title = $4; 246 my $sectiontext = ""; 247 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)<<TOC\d+>>)/$2/i) { 248 $sectiontext = $1; 249 } else { 250 $sectiontext = $html; 251 $html = ""; 252 } 253 254 # remove tags and extra spaces from the title 255 $title =~ s/<\/?[^>]+>//g; 256 $title =~ s/^\s+|\s+$//g; 257 258 # close any sections below the current level and 259 # create a new section (special case for the firstsection) 260 print PROD "<!--\n"; 261 while (($curtoclevel > $toclevel) || 262 (!$firstsection && $curtoclevel == $toclevel)) { 263 $curtoclevel--; 264 print PROD "</Section>\n"; 265 } 266 if ($curtoclevel+1 < $toclevel) { 267 print STDERR "WARNING - jump in toc levels in $input_filename " . 268 "from $curtoclevel to $toclevel\n"; 269 } 270 while ($curtoclevel < $toclevel) { 271 $curtoclevel++; 272 } 273 274 if ($curtoclevel == 1) { 275 # add the header tag 276 print PROD "-->\n"; 277 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n"; 278 print PROD "<!--\n"; 279 } 280 281 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n"; 282 283 print PROD "-->\n"; 284 285 # clean up the section html 286 $sectiontext = $self->HB_clean_section($sectiontext); 287 288 print PROD "$sectiontext\n"; 289 290 } else { 291 print STDERR "WARNING - leftover text\n" , $self->shorten($html), 292 "\nin $input_filename\n"; 293 last; 294 } 295 $firstsection = 0; 296 } 297 298 print PROD "<!--\n"; 299 while (($curtoclevel > $toclevel) || 300 (!$firstsection && $curtoclevel == $toclevel)) { 301 $curtoclevel--; 302 print PROD "</Section>\n"; 303 } 304 print PROD "</Section>\n"; 305 print PROD "-->\n"; 306 307 close (PROD) || die("Error Closing File: $tmp_filename $!"); 308 309 return $tmp_filename; 310 } 311 312 sub convert_tidy_or_oldHDL_file 126 313 { 127 314 my $self = shift (@_); … … 160 347 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); 161 348 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 162 349 163 350 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 164 165 # tidy the input file if it is a HTML-like file351 352 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp 166 353 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 354 { 355 # convert the input file to a new style HDL 356 my $hdl_output_filename = $input_filename; 357 if ($self->{'old_style_HDL'}) 358 { 359 $hdl_output_filename = &util::filename_cat($tmp_dirname, "newHDL_$tailname$suffix"); 360 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename); 361 } 362 363 # tidy the input file 364 my $tidy_output_filename = $hdl_output_filename; 365 if ($self->{'tidy_html'}) 366 { 367 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 368 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename); 369 } 370 371 $tmp_filename = $tidy_output_filename; 372 373 # just for checking copy all other file from the base dir to tmp dir if it is not exists 374 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; 375 my @files = grep {!/^\.+$/} readdir(DIR); 376 close(DIR); 377 378 foreach my $file (@files) 379 { 380 my $src_file = &util::filename_cat($base_dirname,$file); 381 my $dest_file = &util::filename_cat($tmp_dirname,$file); 382 if ((!-e $dest_file) && (!-d $src_file)) 383 { 384 # just copy the original file back to the tmp directory 385 open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!"; 386 open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!"; 387 print TIDYOUT <TIDYIN>; 388 close TIDYIN; 389 close TIDYOUT; 390 } 391 } 392 } 393 else 167 394 { 395 if (!-e $tmp_filename) 396 { 397 # just copy the original file back to the tmp directory 398 open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!"; 399 open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!"; 400 print TIDYOUT <TIDYIN>; 401 close TIDYIN; 402 close TIDYOUT; 403 } 404 } 405 406 return $tmp_filename; 407 } 408 409 410 # Will make the html input file as a proper XML file with removed font tag and 411 # image size added to the img tag. 412 # The tidying process takes place in a collection specific 'tmp' directory so 413 # that we don't accidentally damage the input. 414 sub tmp_tidy_file 415 { 416 my $self = shift (@_); 417 my ($file,$cnfile) = @_; 418 my $input_filename = $file; 419 my $tmp_filename = $cnfile; 420 421 # get the input filename 422 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 423 168 424 # create HTML parser to decode the input file 169 425 my $parser = HTML::TokeParser::Simple->new($input_filename); … … 214 470 print PROD $tidyfile; 215 471 close (PROD) || die("Error Closing File: $tmp_filename $!"); 216 217 # just for checking copy all other file from the base dir to tmp dir if it is not exists218 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";219 my @files = grep {!/^\.+$/} readdir(DIR);220 close(DIR);221 222 foreach my $file (@files)223 {224 my $src_file = &util::filename_cat($base_dirname,$file);225 my $dest_file = &util::filename_cat($tmp_dirname,$file);226 if ((!-e $dest_file) && (!-d $src_file))227 {228 # just copy the original file back to the tmp directory229 open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!";230 open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!";231 print TIDYOUT <TIDYIN>;232 close TIDYIN;233 close TIDYOUT;234 }235 }236 }237 else238 {239 if (!-e $tmp_filename)240 {241 # just copy the original file back to the tmp directory242 open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!";243 open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!";244 print TIDYOUT <TIDYIN>;245 close TIDYIN;246 close TIDYOUT;247 }248 }249 472 250 473 # return the output filename … … 257 480 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 258 481 482 # check the process_exp and block_exp thing 483 my ($block_status,$filename) = $self->read_block(@_); 484 return $block_status if ((!defined $block_status) || ($block_status==0)); 485 259 486 # get the input file 260 487 my $input_filename = $file; … … 262 489 $suffix = lc($suffix); 263 490 264 if ( $self->{'tidy_html'})491 if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'})) 265 492 { 266 # tidy the input file if it is a HTML-like file267 #if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))268 #{269 493 # set the file to be tidied 270 494 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/; 271 495 272 496 # get the tidied file 273 my $tidy_filename = $self->tmp_tidy_file($input_filename); 497 #my $tidy_filename = $self->tmp_tidy_file($input_filename); 498 my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename); 274 499 275 500 # derive tmp filename from input filename … … 279 504 $file = "$tailname$suffix"; 280 505 $base_dir = $dirname; 281 #}282 506 } 283 507 -
trunk/gsdl/perllib/strings.properties
r13968 r14012 819 819 HTMLPlug.tidy_html:If set, converts a HTML document to a well-formed XHTML. It enable users to view the document in the book format. 820 820 821 HTMLPlug.old_style_HDL:To mark whether the file in this collection is sectionalized using the old HDL's section style. 822 821 823 ImagePlug.converttotype:Convert main image to format 's'. 822 824
Note:
See TracChangeset
for help on using the changeset viewer.