Changeset 10426
- Timestamp:
- 2005-08-05T15:16:47+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm
r10404 r10426 63 63 64 64 return bless $self, $class; 65 66 65 } 67 66 … … 93 92 print $outhandle "StructuredHTMLPlug: processing $file\n" 94 93 if $self->{'verbosity'} > 1; 95 96 94 my @head_and_body = split(/<body/i,$$textref); 97 95 my $head = shift(@head_and_body); 98 96 my $body_text = join("<body", @head_and_body); 99 97 98 if (defined $self->{'extracted_word_metadata_fields'}) { 99 my @doc_properties = split(/<xml>/i,$head); 100 my $doc_heading = shift(@doc_properties); 101 my $rest_doc_properties = join(" ", @doc_properties); 102 my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties); 103 my $extracted_metadata = shift (@extracted_metadata); 104 $self->extract_metadata($extracted_metadata, $metadata, $doc_obj); 105 } 106 100 107 # If checkout_toc is enables, it means to get rid of toc and tof contents. 101 108 # get rid of TOC and TOF sections and their title 102 if ($self->{'checkout_toc'}){109 #if (defined $self->{'checkout_toc'}){ 103 110 #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p> 104 111 # get rid of Table of Contents title and Table of Figures 105 112 #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Content.|Content.)<o:p><\/o:p><\/span><\/b><\/p>//isg; 106 113 #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Figure.|Figure.)<o:p><\/o:p><\/span><\/b><\/p>//isg; 107 $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;108 $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;109 }110 111 if ( $self->{'title_header'}){114 #$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg; 115 #$body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg; 116 #} 117 118 if (defined $self->{'title_header'}){ 112 119 $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is; 113 120 $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg; 114 121 } 115 if ( $self->{'level1_header'}){122 if (defined $self->{'level1_header'}){ 116 123 $self->{'level1_header'} =~ s/^(\()(.*)(\))/$2/is; 117 124 $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg; 118 125 } 119 if ( $self->{'level2_header'}){126 if (defined $self->{'level2_header'}){ 120 127 $self->{'level2_header'} =~ s/^(\()(.*)(\))/$2/is; 121 128 $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg; 122 129 } 123 130 124 if ( $self->{'level3_header'}){131 if (defined $self->{'level3_header'}){ 125 132 $self->{'level3_header'} =~ s/^(\()(.*)(\))/$2/is; 126 133 $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg; 127 134 } 128 129 135 # Tidy up extra new lines 130 136 $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg; 131 137 $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg; 132 138 133 139 my $body = "<body".$body_text; 134 140 135 141 my $section_text = $head; 136 142 $section_text .= "<!--\n<Section>\n-->\n"; 137 143 138 144 # split HTML text on <h1>, <h2> etc tags 139 145 my @h_split = split(/<h/i,$body); 140 146 141 147 my $hnum = 0; 142 148 143 149 my $sectionh1 = 0; 144 150 $section_text .= shift(@h_split); 145 151 146 152 my $hc; 147 153 foreach $hc ( @h_split ) … … 162 168 $h_text =~ s/^\s$//s; 163 169 $h_text =~ s/( )+\W*/ /sg; 164 170 165 171 if ($h_text =~ m/\w+/) 166 172 { … … 202 208 print $outhandle $spacing."$h_text\n" 203 209 if $self->{'verbosity'} > 2; 204 210 205 211 $sectionh1++ if ($hnum==1); 206 212 } … … 210 216 211 217 } 212 # $section_text .= "<!-- \n</Section>\n-->\n";213 #print STDERR "***HC = $hc\n";214 218 $section_text .= "<h$hc"; 215 219 } … … 232 236 233 237 $$textref = $section_text; 234 235 # should be textref not testref???236 #$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;237 238 239 # should be textref not testref??? 240 #$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi; 241 238 242 if ($sectionh1>0) 239 243 { … … 243 247 print $outhandle " Passing on the HTMLPlug\n" 244 248 if $self->{'verbosity'} > 1; 245 249 246 250 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g; 247 251 248 252 $$textref =~ s/( )+/ /sg; 249 250 ##$$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures?253 254 ## $$textref =~ s/<o:p> <\/o:p>//g; # used with VML to space figures? 251 255 252 256 $self->SUPER::process(@_); 253 257 254 258 # associate original file with doc object 255 259 my $cursection = $doc_obj->get_top_section(); … … 261 265 262 266 $doc_obj->associate_file($filename, "doc.doc", undef, $cursection); 263 267 264 268 my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>"; 265 269 $doc_obj->add_utf8_metadata ($cursection, "srclink", $doclink); … … 289 293 { 290 294 my ($self,$front,$back,$base_dir,$href) = @_; 291 295 292 296 # dig out width and height of image, if there 293 297 my $img_attributes = "$front back"; 294 298 my ($img_width) = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i); 295 299 my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i); 296 300 297 301 # derive local filename for image based on its URL 298 302 my $img_filename = $href; 299 303 $img_filename =~ s/^[^:]*:\/\///; 300 304 $img_filename = &util::filename_cat($base_dir, $img_filename); 301 305 302 306 # Replace %20's in URL with a space if required. Note that the filename 303 307 # may include the %20 in some situations … … 309 313 if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) { 310 314 # get image info on width and height 311 315 312 316 my $outhandle = $self->{'outhandle'}; 313 317 my $verbosity = $self->{'verbosity'}; … … 315 319 my ($image_type, $actual_width, $actual_height, $image_size) 316 320 = &ImagePlug::identify($img_filename, $outhandle, $verbosity); 317 321 318 322 #print STDERR "**** $actual_width x $actual_height"; 319 323 #print STDERR " (requested: $img_width x $img_height)\n"; … … 321 325 if (($img_width < $actual_width) || ($img_height < $actual_height)) { 322 326 print $outhandle "Resizing $img_filename\n" if ($verbosity > 0); 323 327 324 328 # derive new image name based on current image 325 329 my ($tailname, $dirname, $suffix) 326 330 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 327 331 328 332 my $resized_filename 329 333 = &util::filename_cat($dirname, $tailname."_resized".$suffix); 330 334 331 335 #print STDERR "**** suffix = $suffix\n"; 332 336 333 337 # Generate smaller image with convert 334 338 my $newsize = "$img_widthx$image_height"; … … 338 342 my $result = ''; 339 343 print $outhandle "ImageResize result: $result\n" if ($verbosity > 2); 340 341 } 342 } 343 344 } 345 } 344 346 return $href; 345 347 } 346 347 348 349 348 350 349 sub replace_images { … … 358 357 $back="\"$back"; 359 358 } 360 359 361 360 $link =~ s/\n/ /g; 362 361 363 362 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file); 364 363 365 364 ## $href = $self->resize_if_necessary($front,$back,$base_dir,$href); 366 365 367 366 my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section); 368 367 369 368 return $front . $middle . $back; 370 369 } 371 370 371 sub extract_metadata 372 { 373 my $self = shift (@_); 374 my ($textref, $metadata, $doc_obj) = @_; 375 my $outhandle = $self->{'outhandle'}; 376 377 # metadata fields to extract/save. 'key' is the (lowercase) name of the 378 # html meta, 'value' is the metadata name for greenstone to use 379 my %find_fields = (); 380 my ($tag,$value); 381 382 my $orig_field = ""; 383 foreach my $field (split /,/, $self->{'extracted_word_metadata_fields'}) { 384 # support tag<tagname> 385 if ($field =~ /^(.*?)<(.*?)>$/) { 386 # "$2" is the user's preferred gs metadata name 387 $find_fields{lc($1)}=$2; # lc = lowercase 388 $orig_field = $1; 389 } else { # no <tagname> for mapping 390 # "$field" is the user's preferred gs metadata name 391 $find_fields{lc($field)}=$field; # lc = lowercase 392 $orig_field = $field; 393 } 394 if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){ 395 $tag = $orig_field; 396 $value = $1; 397 if (!defined $value || !defined $tag){ 398 print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n"; 399 next; 400 } else { 401 # clean up and add 402 chomp($value); # remove trailing \n, if any 403 $tag = $find_fields{lc($tag)}; 404 print $outhandle " extracted \"$tag\" metadata \"$value\"\n" 405 if ($self->{'verbosity'} > 2); 406 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value); 407 } 408 } 409 } 410 } 372 411 373 412 1;
Note:
See TracChangeset
for help on using the changeset viewer.