source: gsdl/trunk/perllib/cpan/Image/ExifTool/HTML.pm@ 16842

Last change on this file since 16842 was 16842, checked in by davidb, 16 years ago

ExifTool added to cpan area to support metadata extraction from files such as JPEG. Primarily targetted as Image files (hence the Image folder name decided upon by the ExifTool author) it also can handle video such as flash and audio such as Wav

File size: 17.5 KB
Line 
1#------------------------------------------------------------------------------
2# File: HTML.pm
3#
4# Description: Read HTML meta information
5#
6# Revisions: 01/30/2007 - P. Harvey Created
7#
8# References: 1) http://www.w3.org/TR/html4/
9# 2) http://www.daisy.org/publications/specifications/daisy_202.html
10# 3) http://vancouver-webpages.com/META/metatags.detail.html
11# 4) http://www.html-reference.com/META.htm
12#------------------------------------------------------------------------------
13
14package Image::ExifTool::HTML;
15
16use strict;
17use vars qw($VERSION @ISA @EXPORT_OK);
18use Image::ExifTool qw(:DataAccess :Utils);
19use Image::ExifTool::PostScript;
20use Image::ExifTool::XMP qw(EscapeXML UnescapeXML);
21require Exporter;
22
23$VERSION = '1.03';
24@ISA = qw(Exporter);
25@EXPORT_OK = qw(EscapeHTML UnescapeHTML);
26
27# HTML 4 character entity references
28my %entityNum = (
29 'quot' => 34, 'eth' => 240, 'lsquo' => 8216,
30 'amp' => 38, 'ntilde' => 241, 'rsquo' => 8217,
31 'apos' => 39, 'ograve' => 242, 'sbquo' => 8218,
32 'lt' => 60, 'oacute' => 243, 'ldquo' => 8220,
33 'gt' => 62, 'ocirc' => 244, 'rdquo' => 8221,
34 'nbsp' => 160, 'otilde' => 245, 'bdquo' => 8222,
35 'iexcl' => 161, 'ouml' => 246, 'dagger' => 8224,
36 'cent' => 162, 'divide' => 247, 'Dagger' => 8225,
37 'pound' => 163, 'oslash' => 248, 'bull' => 8226,
38 'curren' => 164, 'ugrave' => 249, 'hellip' => 8230,
39 'yen' => 165, 'uacute' => 250, 'permil' => 8240,
40 'brvbar' => 166, 'ucirc' => 251, 'prime' => 8242,
41 'sect' => 167, 'uuml' => 252, 'Prime' => 8243,
42 'uml' => 168, 'yacute' => 253, 'lsaquo' => 8249,
43 'copy' => 169, 'thorn' => 254, 'rsaquo' => 8250,
44 'ordf' => 170, 'yuml' => 255, 'oline' => 8254,
45 'laquo' => 171, 'OElig' => 338, 'frasl' => 8260,
46 'not' => 172, 'oelig' => 339, 'euro' => 8364,
47 'shy' => 173, 'Scaron' => 352, 'image' => 8465,
48 'reg' => 174, 'scaron' => 353, 'weierp' => 8472,
49 'macr' => 175, 'Yuml' => 376, 'real' => 8476,
50 'deg' => 176, 'fnof' => 402, 'trade' => 8482,
51 'plusmn' => 177, 'circ' => 710, 'alefsym'=> 8501,
52 'sup2' => 178, 'tilde' => 732, 'larr' => 8592,
53 'sup3' => 179, 'Alpha' => 913, 'uarr' => 8593,
54 'acute' => 180, 'Beta' => 914, 'rarr' => 8594,
55 'micro' => 181, 'Gamma' => 915, 'darr' => 8595,
56 'para' => 182, 'Delta' => 916, 'harr' => 8596,
57 'middot' => 183, 'Epsilon'=> 917, 'crarr' => 8629,
58 'cedil' => 184, 'Zeta' => 918, 'lArr' => 8656,
59 'sup1' => 185, 'Eta' => 919, 'uArr' => 8657,
60 'ordm' => 186, 'Theta' => 920, 'rArr' => 8658,
61 'raquo' => 187, 'Iota' => 921, 'dArr' => 8659,
62 'frac14' => 188, 'Kappa' => 922, 'hArr' => 8660,
63 'frac12' => 189, 'Lambda' => 923, 'forall' => 8704,
64 'frac34' => 190, 'Mu' => 924, 'part' => 8706,
65 'iquest' => 191, 'Nu' => 925, 'exist' => 8707,
66 'Agrave' => 192, 'Xi' => 926, 'empty' => 8709,
67 'Aacute' => 193, 'Omicron'=> 927, 'nabla' => 8711,
68 'Acirc' => 194, 'Pi' => 928, 'isin' => 8712,
69 'Atilde' => 195, 'Rho' => 929, 'notin' => 8713,
70 'Auml' => 196, 'Sigma' => 931, 'ni' => 8715,
71 'Aring' => 197, 'Tau' => 932, 'prod' => 8719,
72 'AElig' => 198, 'Upsilon'=> 933, 'sum' => 8721,
73 'Ccedil' => 199, 'Phi' => 934, 'minus' => 8722,
74 'Egrave' => 200, 'Chi' => 935, 'lowast' => 8727,
75 'Eacute' => 201, 'Psi' => 936, 'radic' => 8730,
76 'Ecirc' => 202, 'Omega' => 937, 'prop' => 8733,
77 'Euml' => 203, 'alpha' => 945, 'infin' => 8734,
78 'Igrave' => 204, 'beta' => 946, 'ang' => 8736,
79 'Iacute' => 205, 'gamma' => 947, 'and' => 8743,
80 'Icirc' => 206, 'delta' => 948, 'or' => 8744,
81 'Iuml' => 207, 'epsilon'=> 949, 'cap' => 8745,
82 'ETH' => 208, 'zeta' => 950, 'cup' => 8746,
83 'Ntilde' => 209, 'eta' => 951, 'int' => 8747,
84 'Ograve' => 210, 'theta' => 952, 'there4' => 8756,
85 'Oacute' => 211, 'iota' => 953, 'sim' => 8764,
86 'Ocirc' => 212, 'kappa' => 954, 'cong' => 8773,
87 'Otilde' => 213, 'lambda' => 955, 'asymp' => 8776,
88 'Ouml' => 214, 'mu' => 956, 'ne' => 8800,
89 'times' => 215, 'nu' => 957, 'equiv' => 8801,
90 'Oslash' => 216, 'xi' => 958, 'le' => 8804,
91 'Ugrave' => 217, 'omicron'=> 959, 'ge' => 8805,
92 'Uacute' => 218, 'pi' => 960, 'sub' => 8834,
93 'Ucirc' => 219, 'rho' => 961, 'sup' => 8835,
94 'Uuml' => 220, 'sigmaf' => 962, 'nsub' => 8836,
95 'Yacute' => 221, 'sigma' => 963, 'sube' => 8838,
96 'THORN' => 222, 'tau' => 964, 'supe' => 8839,
97 'szlig' => 223, 'upsilon'=> 965, 'oplus' => 8853,
98 'agrave' => 224, 'phi' => 966, 'otimes' => 8855,
99 'aacute' => 225, 'chi' => 967, 'perp' => 8869,
100 'acirc' => 226, 'psi' => 968, 'sdot' => 8901,
101 'atilde' => 227, 'omega' => 969, 'lceil' => 8968,
102 'auml' => 228, 'thetasym'=>977, 'rceil' => 8969,
103 'aring' => 229, 'upsih' => 978, 'lfloor' => 8970,
104 'aelig' => 230, 'piv' => 982, 'rfloor' => 8971,
105 'ccedil' => 231, 'ensp' => 8194, 'lang' => 9001,
106 'egrave' => 232, 'emsp' => 8195, 'rang' => 9002,
107 'eacute' => 233, 'thinsp' => 8201, 'loz' => 9674,
108 'ecirc' => 234, 'zwnj' => 8204, 'spades' => 9824,
109 'euml' => 235, 'zwj' => 8205, 'clubs' => 9827,
110 'igrave' => 236, 'lrm' => 8206, 'hearts' => 9829,
111 'iacute' => 237, 'rlm' => 8207, 'diams' => 9830,
112 'icirc' => 238, 'ndash' => 8211,
113 'iuml' => 239, 'mdash' => 8212,
114);
115my %entityName; # look up entity names by number (built as necessary)
116
117# HTML info
118# (tag ID's are case insensitive and must be all lower case in tables)
119%Image::ExifTool::HTML::Main = (
120 GROUPS => { 2 => 'Document' },
121 NOTES => q{
122 Meta information extracted from the header of HTML and XHTML files. This is
123 a mix of information found in the C<META> elements and the C<TITLE> element.
124 },
125 dc => {
126 Name => 'DC',
127 SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' },
128 },
129 ncc => {
130 Name => 'NCC',
131 SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' },
132 },
133 prod => {
134 Name => 'Prod',
135 SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' },
136 },
137 vw96 => {
138 Name => 'VW96',
139 SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' },
140 },
141 'http-equiv' => {
142 Name => 'HTTP-equiv',
143 SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' },
144 },
145 abstract => { },
146 author => { },
147 classification => { },
148 copyright => { },
149 description => { },
150 distribution => { },
151 'doc-class' => { Name => 'DocClass' },
152 'doc-rights' => { Name => 'DocRights' },
153 'doc-type' => { Name => 'DocType' },
154 formatter => { },
155 generator => { },
156 googlebot => { Name => 'GoogleBot' },
157 keywords => { List => 1 },
158 mssmarttagspreventparsing => { Name => 'NoMSSmartTags' },
159 owner => { },
160 progid => { Name => 'ProgID' },
161 rating => { },
162 refresh => { },
163 'resource-type' => { Name => 'ResourceType' },
164 'revisit-after' => { Name => 'RevisitAfter' },
165 robots => { List => 1 },
166 title => { Notes => "the only extracted tag which isn't from an HTML META element" },
167);
168
169# ref 2
170%Image::ExifTool::HTML::dc = (
171 GROUPS => { 1 => 'HTML-dc', 2 => 'Document' },
172 NOTES => 'Dublin Core schema tags (also used in XMP).',
173 contributor => { Groups => { 2 => 'Author' }, List => 'Bag' },
174 coverage => { },
175 creator => { Groups => { 2 => 'Author' }, List => 'Seq' },
176 date => {
177 Groups => { 2 => 'Time' },
178 List => 'Seq',
179 PrintConv => '$self->ConvertDateTime($val)',
180 },
181 description => { },
182 'format' => { },
183 identifier => { },
184 language => { List => 'Bag' },
185 publisher => { Groups => { 2 => 'Author' }, List => 'Bag' },
186 relation => { List => 'Bag' },
187 rights => { Groups => { 2 => 'Author' } },
188 source => { Groups => { 2 => 'Author' } },
189 subject => { List => 'Bag' },
190 title => { },
191 type => { List => 'Bag' },
192);
193
194# ref 2
195%Image::ExifTool::HTML::ncc = (
196 GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' },
197 charset => { },
198 depth => { },
199 files => { },
200 footnotes => { },
201 generator => { },
202 kbytesize => { Name => 'KByteSize' },
203 maxpagenormal => { Name => 'MaxPageNormal' },
204 multimediatype => { Name => 'MultimediaType' },
205 narrator => { },
206 pagefront => { Name => 'PageFront' },
207 pagenormal => { Name => 'PageNormal' },
208 pagespecial => { Name => 'PageSpecial' },
209 prodnotes => { Name => 'ProdNotes' },
210 producer => { },
211 produceddate => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # yyyy-mm-dd
212 revision => { },
213 revisiondate => { Name => 'RevisionDate', Groups => { 2 => 'Time' } },
214 setinfo => { Name => 'SetInfo' },
215 sidebars => { },
216 sourcedate => { Name => 'SourceDate', Groups => { 2 => 'Time' } },
217 sourceedition => { Name => 'SourceEdition' },
218 sourcepublisher => { Name => 'SourcePublisher' },
219 sourcerights => { Name => 'SourceRights' },
220 sourcetitle => { Name => 'SourceTitle' },
221 tocitems => { Name => 'TOCItems' },
222 totaltime => { Name => 'Duration' }, # hh:mm:ss
223);
224
225# ref 3
226%Image::ExifTool::HTML::vw96 = (
227 GROUPS => { 1 => 'HTML-vw96', 2 => 'Document' },
228 objecttype => { Name => 'ObjectType' },
229);
230
231# ref 2
232%Image::ExifTool::HTML::prod = (
233 GROUPS => { 1 => 'HTML-prod', 2 => 'Document' },
234 reclocation => { Name => 'RecLocation' },
235 recengineer => { Name => 'RecEngineer' },
236);
237
238# ref 3/4
239%Image::ExifTool::HTML::equiv = (
240 GROUPS => { 1 => 'HTTP-equiv', 2 => 'Document' },
241 NOTES => 'These tags have a family 1 group name of "HTTP-equiv".',
242 'cache-control' => { Name => 'CacheControl' },
243 'content-disposition' => { Name => 'ContentDisposition' },
244 'content-language' => { Name => 'ContentLanguage' },
245 'content-script-type' => { Name => 'ContentScriptType' },
246 'content-style-type' => { Name => 'ContentStyleType' },
247 'content-type' => { Name => 'ContentType' },
248 'default-style' => { Name => 'DefaultStyle' },
249 expires => { },
250 'ext-cache' => { Name => 'ExtCache' },
251 imagetoolbar => { Name => 'ImageToolbar' },
252 lotus => { },
253 'page-enter' => { Name => 'PageEnter' },
254 'page-exit' => { Name => 'PageExit' },
255 'pics-label' => { Name => 'PicsLabel' },
256 pragma => { },
257 refresh => { },
258 'reply-to' => { Name => 'ReplyTo' },
259 'set-cookie' => { Name => 'SetCookie' },
260 'site-enter' => { Name => 'SiteEnter' },
261 'site-exit' => { Name => 'SiteExit' },
262 vary => { },
263 'window-target' => { Name => 'WindowTarget' },
264);
265
266#------------------------------------------------------------------------------
267# Convert single UTF-8 character to HTML character reference
268# Inputs: 0) UTF-8 character sequence
269# Returns: HML character reference (ie. "&quot;");
270# Note: Must be called via EscapeHTML to load name lookup
271sub EscapeChar($)
272{
273 my $ch = shift;
274 my ($val) = ($] >= 5.006001) ? unpack('U0U',$ch) : UnpackUTF8($ch);
275 return '?' unless defined $val;
276 return "&$entityName{$val};" if $entityName{$val};
277 return sprintf('&#x%x;',$val);
278}
279
280#------------------------------------------------------------------------------
281# Escape any special characters for HTML
282# Inputs: 0) UTF-8 string to be escaped
283# Returns: escaped string
284sub EscapeHTML($)
285{
286 my $str = shift;
287 # escape XML characters
288 $str = EscapeXML($str);
289 # escape other special characters if they exist
290 if ($str =~ /[\x80-\xff]/) {
291 # generate entity name lookup if necessary
292 unless (%entityName) {
293 local $_;
294 foreach (keys %entityNum) {
295 $entityName{$entityNum{$_}} = $_;
296 }
297 delete $entityName{39}; # 'apos' is not valid HTML
298 }
299 # supress warnings
300 local $SIG{'__WARN__'} = sub { 1 };
301 # escape any non-ascii characters for HTML
302 $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
303 }
304 return $str;
305}
306
307#------------------------------------------------------------------------------
308# Unescape all HTML character references
309# Inputs: 0) string to be unescaped
310# Returns: unescaped string
311sub UnescapeHTML($)
312{
313 return UnescapeXML(shift, \%entityNum);
314}
315
316#------------------------------------------------------------------------------
317# Extract information from a HTML file
318# Inputs: 0) ExifTool object reference, 1) DirInfo reference
319# Returns: 1 on success, 0 if this wasn't a valid HTML file
320sub ProcessHTML($$)
321{
322 my ($exifTool, $dirInfo) = @_;
323 my $raf = $$dirInfo{RAF};
324 my $verbose = $exifTool->Options('Verbose');
325 my ($buff, $err);
326
327 # validate HTML or XHTML file
328 $raf->Read($buff, 256) or return 0;
329 $buff =~ /^<(!DOCTYPE HTML|HTML|\?xml)/i or return 0;
330 $buff =~ /<(!DOCTYPE )?HTML/i or return 0 if $1 eq '?xml';
331 $exifTool->SetFileType();
332
333 $raf->Seek(0,0) or $exifTool->Warn('Seek error'), return 1;
334
335 my $oldsep = Image::ExifTool::PostScript::SetInputRecordSeparator($raf);
336 $oldsep or $exifTool->Warn('Invalid HTML data'), return 1;
337
338 # extract header information
339 my $doc;
340 while ($raf->ReadLine($buff)) {
341 if (not defined $doc) {
342 # look for 'head' element
343 next unless $buff =~ /<head\b/ig;
344 $doc = substr($buff, pos($buff));
345 next;
346 }
347 $doc .= $buff;
348 last if $buff =~ m{</head>}i;
349 }
350
351 # process all elements in header
352 my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main');
353 for (;;) {
354 last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg;
355 my ($tagName, $attrs) = ($1, $2);
356 my $tag = lc($tagName);
357 my ($val, $grp);
358 unless ($attrs =~ m{/$}) { # self-contained XHTML tags end in '/>'
359 # look for element close
360 my $pos = pos($doc);
361 my $close = "</$tagName>";
362 # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug:
363 # if ($doc =~ m{(.*?)</$tagName>}sg) {
364 # $val = $1;
365 if ($doc =~ m{$close}sg) {
366 $val = substr($doc, $pos, pos($doc)-$pos-length($close));
367 } else {
368 pos($doc) = $pos;
369 next unless $tag eq 'meta'; # META tags don't need to be closed
370 }
371 }
372 my $table = $tagTablePtr;
373 # parse HTML META element
374 if ($tag eq 'meta') {
375 undef $tag;
376 # tag name is in NAME or HTTP-EQUIV attribute
377 if ($attrs =~ /name=['"]?([\w:.-]+)/si) {
378 $tagName = $1;
379 } elsif ($attrs =~ /http-equiv=['"]?([\w:.-]+)/si) {
380 $tagName = "HTTP-equiv.$1";
381 } else {
382 next; # no name
383 }
384 $tag = lc($tagName);
385 # tag value is in CONTENT attribute
386 $val = $2 if $attrs =~ /content=(['"])(.*?)\1/si;
387 next unless $tag and defined $val;
388 # isolate group name (separator is '.' in HTML, but ':' in ref 2)
389 if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) {
390 ($grp, $tag) = ($1, $2);
391 my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $grp);
392 if ($tagInfo and $$tagInfo{SubDirectory}) {
393 $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
394 } else {
395 $tag = "$grp.$tag";
396 }
397 }
398 } else {
399 # the only non-META element we process is TITLE
400 next unless $tag eq 'title';
401 }
402 unless ($$table{$tag}) {
403 my $name = $tagName;
404 $name =~ s/\W+(\w)/\u$1/sg;
405 my $info = { Name => $name, Groups => { 0 => 'HTML' } };
406 $info->{Groups}->{1} = ($grp eq 'http-equiv' ? 'HTTP-equiv' : "HTML-$grp") if $grp;
407 Image::ExifTool::AddTagToTable($table, $tag, $info);
408 $exifTool->VPrint(0, " [adding $tag '$tagName']\n");
409 }
410 $val =~ s{\s*$/\s*}{ }sg; # replace linefeeds and indenting spaces
411 $val = UnescapeHTML($val); # unescape HTML character references
412 $exifTool->HandleTag($table, $tag, $val);
413 }
414 $/ = $oldsep; # restore original separator
415 return 1;
416}
417
4181; # end
419
420__END__
421
422=head1 NAME
423
424Image::ExifTool::HTML - Read HTML meta information
425
426=head1 SYNOPSIS
427
428This module is used by Image::ExifTool
429
430=head1 DESCRIPTION
431
432This module contains routines required by Image::ExifTool to extract
433meta information from HTML documents.
434
435=head1 AUTHOR
436
437Copyright 2003-2007, Phil Harvey (phil at owl.phy.queensu.ca)
438
439This library is free software; you can redistribute it and/or modify it
440under the same terms as Perl itself.
441
442=head1 REFERENCES
443
444=over 4
445
446=item L<http://www.w3.org/TR/html4/>
447
448=item L<http://www.daisy.org/publications/specifications/daisy_202.html>
449
450=item L<http://vancouver-webpages.com/META/metatags.detail.html>
451
452=item L<http://www.html-reference.com/META.htm>
453
454=back
455
456=head1 SEE ALSO
457
458L<Image::ExifTool::TagNames/HTML Tags>,
459L<Image::ExifTool(3pm)|Image::ExifTool>
460
461=cut
462
Note: See TracBrowser for help on using the repository browser.