source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/HTML.pm@ 24107

Last change on this file since 24107 was 24107, checked in by sjm84, 13 years ago

Updating the ExifTool perl modules

File size: 21.5 KB
Line 
1#------------------------------------------------------------------------------
2# File: HTML.pm
3#
4# Description: Read HTML meta information
5#
6# Revisions: 01/30/2007 - P. Harvey Created
7#
8# References: 1) http://www.w3.org/TR/html4/
9# 2) http://www.daisy.org/publications/specifications/daisy_202.html
10# 3) http://vancouver-webpages.com/META/metatags.detail.html
11# 4) http://www.html-reference.com/META.htm
12#------------------------------------------------------------------------------
13
14package Image::ExifTool::HTML;
15
16use strict;
17use vars qw($VERSION @ISA @EXPORT_OK);
18use Image::ExifTool qw(:DataAccess :Utils);
19use Image::ExifTool::PostScript;
20use Image::ExifTool::XMP qw(EscapeXML UnescapeXML);
21require Exporter;
22
23$VERSION = '1.11';
24@ISA = qw(Exporter);
25@EXPORT_OK = qw(EscapeHTML UnescapeHTML);
26
27sub SetHTMLCharset($$);
28
29# convert HTML charset (lower case) to ExifTool Charset name
30my %htmlCharset = (
31 macintosh => 'MacRoman',
32 'iso-8859-1' => 'Latin',
33 'utf-8' => 'UTF8',
34 'windows-1252' => 'Latin',
35);
36
37# HTML 4 character entity references
38my %entityNum = (
39 'quot' => 34, 'eth' => 240, 'lsquo' => 8216,
40 'amp' => 38, 'ntilde' => 241, 'rsquo' => 8217,
41 'apos' => 39, 'ograve' => 242, 'sbquo' => 8218,
42 'lt' => 60, 'oacute' => 243, 'ldquo' => 8220,
43 'gt' => 62, 'ocirc' => 244, 'rdquo' => 8221,
44 'nbsp' => 160, 'otilde' => 245, 'bdquo' => 8222,
45 'iexcl' => 161, 'ouml' => 246, 'dagger' => 8224,
46 'cent' => 162, 'divide' => 247, 'Dagger' => 8225,
47 'pound' => 163, 'oslash' => 248, 'bull' => 8226,
48 'curren' => 164, 'ugrave' => 249, 'hellip' => 8230,
49 'yen' => 165, 'uacute' => 250, 'permil' => 8240,
50 'brvbar' => 166, 'ucirc' => 251, 'prime' => 8242,
51 'sect' => 167, 'uuml' => 252, 'Prime' => 8243,
52 'uml' => 168, 'yacute' => 253, 'lsaquo' => 8249,
53 'copy' => 169, 'thorn' => 254, 'rsaquo' => 8250,
54 'ordf' => 170, 'yuml' => 255, 'oline' => 8254,
55 'laquo' => 171, 'OElig' => 338, 'frasl' => 8260,
56 'not' => 172, 'oelig' => 339, 'euro' => 8364,
57 'shy' => 173, 'Scaron' => 352, 'image' => 8465,
58 'reg' => 174, 'scaron' => 353, 'weierp' => 8472,
59 'macr' => 175, 'Yuml' => 376, 'real' => 8476,
60 'deg' => 176, 'fnof' => 402, 'trade' => 8482,
61 'plusmn' => 177, 'circ' => 710, 'alefsym'=> 8501,
62 'sup2' => 178, 'tilde' => 732, 'larr' => 8592,
63 'sup3' => 179, 'Alpha' => 913, 'uarr' => 8593,
64 'acute' => 180, 'Beta' => 914, 'rarr' => 8594,
65 'micro' => 181, 'Gamma' => 915, 'darr' => 8595,
66 'para' => 182, 'Delta' => 916, 'harr' => 8596,
67 'middot' => 183, 'Epsilon'=> 917, 'crarr' => 8629,
68 'cedil' => 184, 'Zeta' => 918, 'lArr' => 8656,
69 'sup1' => 185, 'Eta' => 919, 'uArr' => 8657,
70 'ordm' => 186, 'Theta' => 920, 'rArr' => 8658,
71 'raquo' => 187, 'Iota' => 921, 'dArr' => 8659,
72 'frac14' => 188, 'Kappa' => 922, 'hArr' => 8660,
73 'frac12' => 189, 'Lambda' => 923, 'forall' => 8704,
74 'frac34' => 190, 'Mu' => 924, 'part' => 8706,
75 'iquest' => 191, 'Nu' => 925, 'exist' => 8707,
76 'Agrave' => 192, 'Xi' => 926, 'empty' => 8709,
77 'Aacute' => 193, 'Omicron'=> 927, 'nabla' => 8711,
78 'Acirc' => 194, 'Pi' => 928, 'isin' => 8712,
79 'Atilde' => 195, 'Rho' => 929, 'notin' => 8713,
80 'Auml' => 196, 'Sigma' => 931, 'ni' => 8715,
81 'Aring' => 197, 'Tau' => 932, 'prod' => 8719,
82 'AElig' => 198, 'Upsilon'=> 933, 'sum' => 8721,
83 'Ccedil' => 199, 'Phi' => 934, 'minus' => 8722,
84 'Egrave' => 200, 'Chi' => 935, 'lowast' => 8727,
85 'Eacute' => 201, 'Psi' => 936, 'radic' => 8730,
86 'Ecirc' => 202, 'Omega' => 937, 'prop' => 8733,
87 'Euml' => 203, 'alpha' => 945, 'infin' => 8734,
88 'Igrave' => 204, 'beta' => 946, 'ang' => 8736,
89 'Iacute' => 205, 'gamma' => 947, 'and' => 8743,
90 'Icirc' => 206, 'delta' => 948, 'or' => 8744,
91 'Iuml' => 207, 'epsilon'=> 949, 'cap' => 8745,
92 'ETH' => 208, 'zeta' => 950, 'cup' => 8746,
93 'Ntilde' => 209, 'eta' => 951, 'int' => 8747,
94 'Ograve' => 210, 'theta' => 952, 'there4' => 8756,
95 'Oacute' => 211, 'iota' => 953, 'sim' => 8764,
96 'Ocirc' => 212, 'kappa' => 954, 'cong' => 8773,
97 'Otilde' => 213, 'lambda' => 955, 'asymp' => 8776,
98 'Ouml' => 214, 'mu' => 956, 'ne' => 8800,
99 'times' => 215, 'nu' => 957, 'equiv' => 8801,
100 'Oslash' => 216, 'xi' => 958, 'le' => 8804,
101 'Ugrave' => 217, 'omicron'=> 959, 'ge' => 8805,
102 'Uacute' => 218, 'pi' => 960, 'sub' => 8834,
103 'Ucirc' => 219, 'rho' => 961, 'sup' => 8835,
104 'Uuml' => 220, 'sigmaf' => 962, 'nsub' => 8836,
105 'Yacute' => 221, 'sigma' => 963, 'sube' => 8838,
106 'THORN' => 222, 'tau' => 964, 'supe' => 8839,
107 'szlig' => 223, 'upsilon'=> 965, 'oplus' => 8853,
108 'agrave' => 224, 'phi' => 966, 'otimes' => 8855,
109 'aacute' => 225, 'chi' => 967, 'perp' => 8869,
110 'acirc' => 226, 'psi' => 968, 'sdot' => 8901,
111 'atilde' => 227, 'omega' => 969, 'lceil' => 8968,
112 'auml' => 228, 'thetasym'=>977, 'rceil' => 8969,
113 'aring' => 229, 'upsih' => 978, 'lfloor' => 8970,
114 'aelig' => 230, 'piv' => 982, 'rfloor' => 8971,
115 'ccedil' => 231, 'ensp' => 8194, 'lang' => 9001,
116 'egrave' => 232, 'emsp' => 8195, 'rang' => 9002,
117 'eacute' => 233, 'thinsp' => 8201, 'loz' => 9674,
118 'ecirc' => 234, 'zwnj' => 8204, 'spades' => 9824,
119 'euml' => 235, 'zwj' => 8205, 'clubs' => 9827,
120 'igrave' => 236, 'lrm' => 8206, 'hearts' => 9829,
121 'iacute' => 237, 'rlm' => 8207, 'diams' => 9830,
122 'icirc' => 238, 'ndash' => 8211,
123 'iuml' => 239, 'mdash' => 8212,
124);
125my %entityName; # look up entity names by number (built as necessary)
126
127# HTML info
128# (tag ID's are case insensitive and must be all lower case in tables)
129%Image::ExifTool::HTML::Main = (
130 GROUPS => { 2 => 'Document' },
131 NOTES => q{
132 Meta information extracted from the header of HTML and XHTML files. This is
133 a mix of information found in the C<META> elements, C<XML> element, and the
134 C<TITLE> element.
135 },
136 dc => {
137 Name => 'DC',
138 SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' },
139 },
140 ncc => {
141 Name => 'NCC',
142 SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' },
143 },
144 prod => {
145 Name => 'Prod',
146 SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' },
147 },
148 vw96 => {
149 Name => 'VW96',
150 SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' },
151 },
152 'http-equiv' => {
153 Name => 'HTTP-equiv',
154 SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' },
155 },
156 o => {
157 Name => 'Office',
158 SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' },
159 },
160 abstract => { },
161 author => { },
162 classification => { },
163 copyright => { },
164 description => { },
165 distribution => { },
166 'doc-class' => { Name => 'DocClass' },
167 'doc-rights' => { Name => 'DocRights' },
168 'doc-type' => { Name => 'DocType' },
169 formatter => { },
170 generator => { },
171 googlebot => { Name => 'GoogleBot' },
172 keywords => { List => 1 },
173 mssmarttagspreventparsing => { Name => 'NoMSSmartTags' },
174 originator => { },
175 owner => { },
176 progid => { Name => 'ProgID' },
177 rating => { },
178 refresh => { },
179 'resource-type' => { Name => 'ResourceType' },
180 'revisit-after' => { Name => 'RevisitAfter' },
181 robots => { List => 1 },
182 title => { Notes => "the only extracted tag which isn't from an HTML META element" },
183);
184
185# ref 2
186%Image::ExifTool::HTML::dc = (
187 GROUPS => { 1 => 'HTML-dc', 2 => 'Document' },
188 NOTES => 'Dublin Core schema tags (also used in XMP).',
189 contributor => { Groups => { 2 => 'Author' }, List => 'Bag' },
190 coverage => { },
191 creator => { Groups => { 2 => 'Author' }, List => 'Seq' },
192 date => {
193 Groups => { 2 => 'Time' },
194 List => 'Seq',
195 PrintConv => '$self->ConvertDateTime($val)',
196 },
197 description => { },
198 'format' => { },
199 identifier => { },
200 language => { List => 'Bag' },
201 publisher => { Groups => { 2 => 'Author' }, List => 'Bag' },
202 relation => { List => 'Bag' },
203 rights => { Groups => { 2 => 'Author' } },
204 source => { Groups => { 2 => 'Author' } },
205 subject => { List => 'Bag' },
206 title => { },
207 type => { List => 'Bag' },
208);
209
210# ref 2
211%Image::ExifTool::HTML::ncc = (
212 GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' },
213 charset => { Name => 'CharacterSet' }, # name changed to avoid conflict with -charset option
214 depth => { },
215 files => { },
216 footnotes => { },
217 generator => { },
218 kbytesize => { Name => 'KByteSize' },
219 maxpagenormal => { Name => 'MaxPageNormal' },
220 multimediatype => { Name => 'MultimediaType' },
221 narrator => { },
222 pagefront => { Name => 'PageFront' },
223 pagenormal => { Name => 'PageNormal' },
224 pagespecial => { Name => 'PageSpecial' },
225 prodnotes => { Name => 'ProdNotes' },
226 producer => { },
227 produceddate => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # YYYY-mm-dd
228 revision => { },
229 revisiondate => { Name => 'RevisionDate', Groups => { 2 => 'Time' } },
230 setinfo => { Name => 'SetInfo' },
231 sidebars => { },
232 sourcedate => { Name => 'SourceDate', Groups => { 2 => 'Time' } },
233 sourceedition => { Name => 'SourceEdition' },
234 sourcepublisher => { Name => 'SourcePublisher' },
235 sourcerights => { Name => 'SourceRights' },
236 sourcetitle => { Name => 'SourceTitle' },
237 tocitems => { Name => 'TOCItems' },
238 totaltime => { Name => 'Duration' }, # HH:MM:SS
239);
240
241# ref 3
242%Image::ExifTool::HTML::vw96 = (
243 GROUPS => { 1 => 'HTML-vw96', 2 => 'Document' },
244 objecttype => { Name => 'ObjectType' },
245);
246
247# ref 2
248%Image::ExifTool::HTML::prod = (
249 GROUPS => { 1 => 'HTML-prod', 2 => 'Document' },
250 reclocation => { Name => 'RecLocation' },
251 recengineer => { Name => 'RecEngineer' },
252);
253
254# ref 3/4
255%Image::ExifTool::HTML::equiv = (
256 GROUPS => { 1 => 'HTTP-equiv', 2 => 'Document' },
257 NOTES => 'These tags have a family 1 group name of "HTTP-equiv".',
258 'cache-control' => { Name => 'CacheControl' },
259 'content-disposition' => { Name => 'ContentDisposition' },
260 'content-language' => { Name => 'ContentLanguage' },
261 'content-script-type' => { Name => 'ContentScriptType' },
262 'content-style-type' => { Name => 'ContentStyleType' },
263 # note: setting the HTMLCharset like this will miss any tags which come earlier
264 'content-type' => { Name => 'ContentType', RawConv => \&SetHTMLCharset },
265 'default-style' => { Name => 'DefaultStyle' },
266 expires => { },
267 'ext-cache' => { Name => 'ExtCache' },
268 imagetoolbar => { Name => 'ImageToolbar' },
269 lotus => { },
270 'page-enter' => { Name => 'PageEnter' },
271 'page-exit' => { Name => 'PageExit' },
272 'pics-label' => { Name => 'PicsLabel' },
273 pragma => { },
274 refresh => { },
275 'reply-to' => { Name => 'ReplyTo' },
276 'set-cookie' => { Name => 'SetCookie' },
277 'site-enter' => { Name => 'SiteEnter' },
278 'site-exit' => { Name => 'SiteExit' },
279 vary => { },
280 'window-target' => { Name => 'WindowTarget' },
281);
282
283# MS Office namespace (ref PH)
284%Image::ExifTool::HTML::Office = (
285 GROUPS => { 1 => 'HTML-office', 2 => 'Document' },
286 NOTES => 'Tags written by Microsoft Office applications.',
287 Subject => { },
288 Author => { Groups => { 2 => 'Author' } },
289 Keywords => { },
290 Description => { },
291 Template => { },
292 LastAuthor => { Groups => { 2 => 'Author' } },
293 Revision => { Name => 'RevisionNumber' },
294 TotalTime => { Name => 'TotalEditTime', PrintConv => 'ConvertTimeSpan($val, 60)' },
295 Created => {
296 Name => 'CreateDate',
297 Groups => { 2 => 'Time' },
298 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
299 PrintConv => '$self->ConvertDateTime($val)',
300 },
301 LastSaved => {
302 Name => 'ModifyDate',
303 Groups => { 2 => 'Time' },
304 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
305 PrintConv => '$self->ConvertDateTime($val)',
306 },
307 LastSaved => {
308 Name => 'ModifyDate',
309 Groups => { 2 => 'Time' },
310 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
311 PrintConv => '$self->ConvertDateTime($val)',
312 },
313 LastPrinted => {
314 Name => 'LastPrinted',
315 Groups => { 2 => 'Time' },
316 ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
317 PrintConv => '$self->ConvertDateTime($val)',
318 },
319 Pages => { },
320 Words => { },
321 Characters => { },
322 Category => { },
323 Manager => { },
324 Company => { },
325 Lines => { },
326 Paragraphs => { },
327 CharactersWithSpaces => { },
328 Version => { Name => 'RevisionNumber' },
329);
330
331#------------------------------------------------------------------------------
332# Set HTMLCharset member based on content type
333# Inputs: 0) content type string, 1) ExifTool ref
334# Returns: original string
335sub SetHTMLCharset($$)
336{
337 my ($val, $exifTool) = @_;
338 $$exifTool{HTMLCharset} = $htmlCharset{lc $1} if $val =~ /charset=['"]?([-\w]+)/;
339 return $val;
340}
341
342#------------------------------------------------------------------------------
343# Convert single UTF-8 character to HTML character reference
344# Inputs: 0) UTF-8 character sequence
345# Returns: HTML character reference (ie. "&quot;");
346# Note: Must be called via EscapeHTML to load name lookup
347sub EscapeChar($)
348{
349 my $ch = shift;
350 my $val;
351 if ($] < 5.006001) {
352 ($val) = Image::ExifTool::UnpackUTF8($ch);
353 } else {
354 # the meaning of "U0" is reversed as of Perl 5.10.0!
355 ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch);
356 }
357 return '?' unless defined $val;
358 return "&$entityName{$val};" if $entityName{$val};
359 return sprintf('&#x%x;',$val);
360}
361
362#------------------------------------------------------------------------------
363# Escape any special characters for HTML
364# Inputs: 0) UTF-8 string to be escaped
365# Returns: escaped string
366sub EscapeHTML($)
367{
368 my $str = shift;
369 # escape XML characters
370 $str = EscapeXML($str);
371 # escape other special characters if they exist
372 if ($str =~ /[\x80-\xff]/) {
373 # generate entity name lookup if necessary
374 unless (%entityName) {
375 local $_;
376 foreach (keys %entityNum) {
377 $entityName{$entityNum{$_}} = $_;
378 }
379 delete $entityName{39}; # 'apos' is not valid HTML
380 }
381 # suppress warnings
382 local $SIG{'__WARN__'} = sub { 1 };
383 # escape any non-ascii characters for HTML
384 $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
385 }
386 return $str;
387}
388
389#------------------------------------------------------------------------------
390# Unescape all HTML character references
391# Inputs: 0) string to be unescaped
392# Returns: unescaped string
393sub UnescapeHTML($)
394{
395 return UnescapeXML(shift, \%entityNum);
396}
397
398#------------------------------------------------------------------------------
399# Extract information from a HTML file
400# Inputs: 0) ExifTool object reference, 1) DirInfo reference
401# Returns: 1 on success, 0 if this wasn't a valid HTML file
402sub ProcessHTML($$)
403{
404 my ($exifTool, $dirInfo) = @_;
405 my $raf = $$dirInfo{RAF};
406 my $buff;
407
408 # validate HTML or XHTML file
409 $raf->Read($buff, 256) or return 0;
410 $buff =~ /^<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0;
411 $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $1 eq '?xml';
412 $exifTool->SetFileType();
413
414 $raf->Seek(0,0) or $exifTool->Warn('Seek error'), return 1;
415
416 local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf);
417 $/ or $exifTool->Warn('Invalid HTML data'), return 1;
418
419 # extract header information
420 my $doc;
421 while ($raf->ReadLine($buff)) {
422 if (not defined $doc) {
423 # look for 'head' element
424 next unless $buff =~ /<head\b/ig;
425 $doc = substr($buff, pos($buff));
426 next;
427 }
428 $doc .= $buff;
429 last if $buff =~ m{</head>}i;
430 }
431 return 1 unless defined $doc;
432
433 # process all elements in header
434 my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main');
435 for (;;) {
436 last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg;
437 my ($tagName, $attrs) = ($1, $2);
438 my $tag = lc($tagName);
439 my ($val, $grp);
440 unless ($attrs =~ m{/$}) { # self-contained XHTML tags end in '/>'
441 # look for element close
442 my $pos = pos($doc);
443 my $close = "</$tagName>";
444 # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug:
445 # if ($doc =~ m{(.*?)</$tagName>}sg) {
446 # $val = $1;
447 if ($doc =~ m{$close}sg) {
448 $val = substr($doc, $pos, pos($doc)-$pos-length($close));
449 } else {
450 pos($doc) = $pos;
451 next unless $tag eq 'meta'; # META tags don't need to be closed
452 }
453 }
454 my $table = $tagTablePtr;
455 if ($tag eq 'meta') {
456 # parse HTML META element
457 undef $tag;
458 # tag name is in NAME or HTTP-EQUIV attribute
459 if ($attrs =~ /name=['"]?([\w:.-]+)/si) {
460 $tagName = $1;
461 } elsif ($attrs =~ /http-equiv=['"]?([\w:.-]+)/si) {
462 $tagName = "HTTP-equiv.$1";
463 } else {
464 next; # no name
465 }
466 $tag = lc($tagName);
467 # tag value is in CONTENT attribute
468 $val = $2 if $attrs =~ /content=(['"])(.*?)\1/si;
469 next unless $tag and defined $val;
470 # isolate group name (separator is '.' in HTML, but ':' in ref 2)
471 if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) {
472 ($grp, $tag) = ($1, $2);
473 my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $grp);
474 if ($tagInfo and $$tagInfo{SubDirectory}) {
475 $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
476 } else {
477 $tag = "$grp.$tag";
478 }
479 }
480 } elsif ($tag eq 'xml') {
481 $exifTool->VPrint(0, "Parsing XML\n");
482 # parse XML tags (quick-and-dirty)
483 my $xml = $val;
484 while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) {
485 ($grp, $tag, $val) = ($1, $2, $4);
486 my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $grp);
487 next unless $tagInfo and $$tagInfo{SubDirectory};
488 $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
489 unless ($$table{$tag}) {
490 my $name = ucfirst $tag;
491 $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes
492 $name =~ s/\s(.)/\U$1/g; # capitalize all words in tag name
493 $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters (also hex code wide chars)
494 Image::ExifTool::AddTagToTable($table, $tag, { Name => $name });
495 $exifTool->VPrint(0, " [adding $tag '$name']\n");
496 }
497 $val = $exifTool->Decode($val, $$exifTool{HTMLCharset}) if $$exifTool{HTMLCharset};
498 $exifTool->HandleTag($table, $tag, UnescapeXML($val));
499 }
500 next;
501 } else {
502 # the only other element we process is TITLE
503 next unless $tag eq 'title';
504 }
505 unless ($$table{$tag}) {
506 my $name = $tagName;
507 $name =~ s/\W+(\w)/\u$1/sg;
508 my $info = { Name => $name, Groups => { 0 => 'HTML' } };
509 $info->{Groups}->{1} = ($grp eq 'http-equiv' ? 'HTTP-equiv' : "HTML-$grp") if $grp;
510 Image::ExifTool::AddTagToTable($table, $tag, $info);
511 $exifTool->VPrint(0, " [adding $tag '$tagName']\n");
512 }
513 # recode if necessary
514 $val = $exifTool->Decode($val, $$exifTool{HTMLCharset}) if $$exifTool{HTMLCharset};
515 $val =~ s{\s*$/\s*}{ }sg; # replace linefeeds and indenting spaces
516 $val = UnescapeHTML($val); # unescape HTML character references
517 $exifTool->HandleTag($table, $tag, $val);
518 }
519 return 1;
520}
521
5221; # end
523
524__END__
525
526=head1 NAME
527
528Image::ExifTool::HTML - Read HTML meta information
529
530=head1 SYNOPSIS
531
532This module is used by Image::ExifTool
533
534=head1 DESCRIPTION
535
536This module contains routines required by Image::ExifTool to extract
537meta information from HTML documents.
538
539=head1 AUTHOR
540
541Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
542
543This library is free software; you can redistribute it and/or modify it
544under the same terms as Perl itself.
545
546=head1 REFERENCES
547
548=over 4
549
550=item L<http://www.w3.org/TR/html4/>
551
552=item L<http://www.daisy.org/publications/specifications/daisy_202.html>
553
554=item L<http://vancouver-webpages.com/META/metatags.detail.html>
555
556=item L<http://www.html-reference.com/META.htm>
557
558=back
559
560=head1 SEE ALSO
561
562L<Image::ExifTool::TagNames/HTML Tags>,
563L<Image::ExifTool(3pm)|Image::ExifTool>
564
565=cut
566
Note: See TracBrowser for help on using the repository browser.