- Timestamp:
- 2011-06-01T12:33:42+12:00 (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/cpan/Image/ExifTool/HTML.pm
r16842 r24107 21 21 require Exporter; 22 22 23 $VERSION = '1. 03';23 $VERSION = '1.11'; 24 24 @ISA = qw(Exporter); 25 25 @EXPORT_OK = qw(EscapeHTML UnescapeHTML); 26 27 sub SetHTMLCharset($$); 28 29 # convert HTML charset (lower case) to ExifTool Charset name 30 my %htmlCharset = ( 31 macintosh => 'MacRoman', 32 'iso-8859-1' => 'Latin', 33 'utf-8' => 'UTF8', 34 'windows-1252' => 'Latin', 35 ); 26 36 27 37 # HTML 4 character entity references … … 121 131 NOTES => q{ 122 132 Meta information extracted from the header of HTML and XHTML files. This is 123 a mix of information found in the C<META> elements and the C<TITLE> element. 133 a mix of information found in the C<META> elements, C<XML> element, and the 134 C<TITLE> element. 124 135 }, 125 136 dc => { … … 142 153 Name => 'HTTP-equiv', 143 154 SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' }, 155 }, 156 o => { 157 Name => 'Office', 158 SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' }, 144 159 }, 145 160 abstract => { }, … … 157 172 keywords => { List => 1 }, 158 173 mssmarttagspreventparsing => { Name => 'NoMSSmartTags' }, 174 originator => { }, 159 175 owner => { }, 160 176 progid => { Name => 'ProgID' }, … … 195 211 %Image::ExifTool::HTML::ncc = ( 196 212 GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' }, 197 charset => { },213 charset => { Name => 'CharacterSet' }, # name changed to avoid conflict with -charset option 198 214 depth => { }, 199 215 files => { }, … … 209 225 prodnotes => { Name => 'ProdNotes' }, 210 226 producer => { }, 211 produceddate => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # yyyy-mm-dd227 produceddate => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # YYYY-mm-dd 212 228 revision => { }, 213 229 revisiondate => { Name => 'RevisionDate', Groups => { 2 => 'Time' } }, … … 220 236 sourcetitle => { Name => 'SourceTitle' }, 221 237 tocitems => { Name => 'TOCItems' }, 222 totaltime => { Name => 'Duration' }, # hh:mm:ss238 totaltime => { Name => 'Duration' }, # HH:MM:SS 223 239 ); 224 240 … … 245 261 'content-script-type' => { Name => 'ContentScriptType' }, 246 262 'content-style-type' => { Name => 'ContentStyleType' }, 247 'content-type' => { Name => 'ContentType' }, 263 # note: setting the HTMLCharset like this will miss any tags which come earlier 264 'content-type' => { Name => 'ContentType', RawConv => \&SetHTMLCharset }, 248 265 'default-style' => { Name => 'DefaultStyle' }, 249 266 expires => { }, … … 264 281 ); 265 282 283 # MS Office namespace (ref PH) 284 %Image::ExifTool::HTML::Office = ( 285 GROUPS => { 1 => 'HTML-office', 2 => 'Document' }, 286 NOTES => 'Tags written by Microsoft Office applications.', 287 Subject => { }, 288 Author => { Groups => { 2 => 'Author' } }, 289 Keywords => { }, 290 Description => { }, 291 Template => { }, 292 LastAuthor => { Groups => { 2 => 'Author' } }, 293 Revision => { Name => 'RevisionNumber' }, 294 TotalTime => { Name => 'TotalEditTime', PrintConv => 'ConvertTimeSpan($val, 60)' }, 295 Created => { 296 Name => 'CreateDate', 297 Groups => { 2 => 'Time' }, 298 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', 299 PrintConv => '$self->ConvertDateTime($val)', 300 }, 301 LastSaved => { 302 Name => 'ModifyDate', 303 Groups => { 2 => 'Time' }, 304 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', 305 PrintConv => '$self->ConvertDateTime($val)', 306 }, 307 LastSaved => { 308 Name => 'ModifyDate', 309 Groups => { 2 => 'Time' }, 310 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', 311 PrintConv => '$self->ConvertDateTime($val)', 312 }, 313 LastPrinted => { 314 Name => 'LastPrinted', 315 Groups => { 2 => 'Time' }, 316 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)', 317 PrintConv => '$self->ConvertDateTime($val)', 318 }, 319 Pages => { }, 320 Words => { }, 321 Characters => { }, 322 Category => { }, 323 Manager => { }, 324 Company => { }, 325 Lines => { }, 326 Paragraphs => { }, 327 CharactersWithSpaces => { }, 328 Version => { Name => 'RevisionNumber' }, 329 ); 330 331 #------------------------------------------------------------------------------ 332 # Set HTMLCharset member based on content type 333 # Inputs: 0) content type string, 1) ExifTool ref 334 # Returns: original string 335 sub SetHTMLCharset($$) 336 { 337 my ($val, $exifTool) = @_; 338 $$exifTool{HTMLCharset} = $htmlCharset{lc $1} if $val =~ /charset=['"]?([-\w]+)/; 339 return $val; 340 } 341 266 342 #------------------------------------------------------------------------------ 267 343 # Convert single UTF-8 character to HTML character reference 268 344 # Inputs: 0) UTF-8 character sequence 269 # Returns: H ML character reference (ie. """);345 # Returns: HTML character reference (ie. """); 270 346 # Note: Must be called via EscapeHTML to load name lookup 271 347 sub EscapeChar($) 272 348 { 273 349 my $ch = shift; 274 my ($val) = ($] >= 5.006001) ? unpack('U0U',$ch) : UnpackUTF8($ch); 350 my $val; 351 if ($] < 5.006001) { 352 ($val) = Image::ExifTool::UnpackUTF8($ch); 353 } else { 354 # the meaning of "U0" is reversed as of Perl 5.10.0! 355 ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch); 356 } 275 357 return '?' unless defined $val; 276 358 return "&$entityName{$val};" if $entityName{$val}; … … 297 379 delete $entityName{39}; # 'apos' is not valid HTML 298 380 } 299 # sup ress warnings381 # suppress warnings 300 382 local $SIG{'__WARN__'} = sub { 1 }; 301 383 # escape any non-ascii characters for HTML … … 322 404 my ($exifTool, $dirInfo) = @_; 323 405 my $raf = $$dirInfo{RAF}; 324 my $verbose = $exifTool->Options('Verbose'); 325 my ($buff, $err); 406 my $buff; 326 407 327 408 # validate HTML or XHTML file 328 409 $raf->Read($buff, 256) or return 0; 329 $buff =~ /^<(!DOCTYPE 330 $buff =~ /<(!DOCTYPE 410 $buff =~ /^<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0; 411 $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $1 eq '?xml'; 331 412 $exifTool->SetFileType(); 332 413 333 414 $raf->Seek(0,0) or $exifTool->Warn('Seek error'), return 1; 334 415 335 my $oldsep = Image::ExifTool::PostScript::SetInputRecordSeparator($raf);336 $ oldsepor $exifTool->Warn('Invalid HTML data'), return 1;416 local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf); 417 $/ or $exifTool->Warn('Invalid HTML data'), return 1; 337 418 338 419 # extract header information … … 348 429 last if $buff =~ m{</head>}i; 349 430 } 431 return 1 unless defined $doc; 350 432 351 433 # process all elements in header … … 371 453 } 372 454 my $table = $tagTablePtr; 373 # parse HTML META element374 455 if ($tag eq 'meta') { 456 # parse HTML META element 375 457 undef $tag; 376 458 # tag name is in NAME or HTTP-EQUIV attribute … … 396 478 } 397 479 } 480 } elsif ($tag eq 'xml') { 481 $exifTool->VPrint(0, "Parsing XML\n"); 482 # parse XML tags (quick-and-dirty) 483 my $xml = $val; 484 while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) { 485 ($grp, $tag, $val) = ($1, $2, $4); 486 my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $grp); 487 next unless $tagInfo and $$tagInfo{SubDirectory}; 488 $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable}); 489 unless ($$table{$tag}) { 490 my $name = ucfirst $tag; 491 $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes 492 $name =~ s/\s(.)/\U$1/g; # capitalize all words in tag name 493 $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters (also hex code wide chars) 494 Image::ExifTool::AddTagToTable($table, $tag, { Name => $name }); 495 $exifTool->VPrint(0, " [adding $tag '$name']\n"); 496 } 497 $val = $exifTool->Decode($val, $$exifTool{HTMLCharset}) if $$exifTool{HTMLCharset}; 498 $exifTool->HandleTag($table, $tag, UnescapeXML($val)); 499 } 500 next; 398 501 } else { 399 # the only non-METAelement we process is TITLE502 # the only other element we process is TITLE 400 503 next unless $tag eq 'title'; 401 504 } … … 408 511 $exifTool->VPrint(0, " [adding $tag '$tagName']\n"); 409 512 } 513 # recode if necessary 514 $val = $exifTool->Decode($val, $$exifTool{HTMLCharset}) if $$exifTool{HTMLCharset}; 410 515 $val =~ s{\s*$/\s*}{ }sg; # replace linefeeds and indenting spaces 411 516 $val = UnescapeHTML($val); # unescape HTML character references 412 517 $exifTool->HandleTag($table, $tag, $val); 413 518 } 414 $/ = $oldsep; # restore original separator415 519 return 1; 416 520 } … … 435 539 =head1 AUTHOR 436 540 437 Copyright 2003-20 07, Phil Harvey (phil at owl.phy.queensu.ca)541 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca) 438 542 439 543 This library is free software; you can redistribute it and/or modify it
Note:
See TracChangeset
for help on using the changeset viewer.