1 | #------------------------------------------------------------------------------
|
---|
2 | # File: HTML.pm
|
---|
3 | #
|
---|
4 | # Description: Read HTML meta information
|
---|
5 | #
|
---|
6 | # Revisions: 01/30/2007 - P. Harvey Created
|
---|
7 | #
|
---|
8 | # References: 1) http://www.w3.org/TR/html4/
|
---|
9 | # 2) http://www.daisy.org/publications/specifications/daisy_202.html
|
---|
10 | # 3) http://vancouver-webpages.com/META/metatags.detail.html
|
---|
11 | # 4) http://www.html-reference.com/META.htm
|
---|
12 | #------------------------------------------------------------------------------
|
---|
13 |
|
---|
14 | package Image::ExifTool::HTML;
|
---|
15 |
|
---|
16 | use strict;
|
---|
17 | use vars qw($VERSION @ISA @EXPORT_OK);
|
---|
18 | use Image::ExifTool qw(:DataAccess :Utils);
|
---|
19 | use Image::ExifTool::PostScript;
|
---|
20 | use Image::ExifTool::XMP qw(EscapeXML UnescapeXML);
|
---|
21 | require Exporter;
|
---|
22 |
|
---|
23 | $VERSION = '1.11';
|
---|
24 | @ISA = qw(Exporter);
|
---|
25 | @EXPORT_OK = qw(EscapeHTML UnescapeHTML);
|
---|
26 |
|
---|
27 | sub SetHTMLCharset($$);
|
---|
28 |
|
---|
29 | # convert HTML charset (lower case) to ExifTool Charset name
|
---|
30 | my %htmlCharset = (
|
---|
31 | macintosh => 'MacRoman',
|
---|
32 | 'iso-8859-1' => 'Latin',
|
---|
33 | 'utf-8' => 'UTF8',
|
---|
34 | 'windows-1252' => 'Latin',
|
---|
35 | );
|
---|
36 |
|
---|
37 | # HTML 4 character entity references
|
---|
38 | my %entityNum = (
|
---|
39 | 'quot' => 34, 'eth' => 240, 'lsquo' => 8216,
|
---|
40 | 'amp' => 38, 'ntilde' => 241, 'rsquo' => 8217,
|
---|
41 | 'apos' => 39, 'ograve' => 242, 'sbquo' => 8218,
|
---|
42 | 'lt' => 60, 'oacute' => 243, 'ldquo' => 8220,
|
---|
43 | 'gt' => 62, 'ocirc' => 244, 'rdquo' => 8221,
|
---|
44 | 'nbsp' => 160, 'otilde' => 245, 'bdquo' => 8222,
|
---|
45 | 'iexcl' => 161, 'ouml' => 246, 'dagger' => 8224,
|
---|
46 | 'cent' => 162, 'divide' => 247, 'Dagger' => 8225,
|
---|
47 | 'pound' => 163, 'oslash' => 248, 'bull' => 8226,
|
---|
48 | 'curren' => 164, 'ugrave' => 249, 'hellip' => 8230,
|
---|
49 | 'yen' => 165, 'uacute' => 250, 'permil' => 8240,
|
---|
50 | 'brvbar' => 166, 'ucirc' => 251, 'prime' => 8242,
|
---|
51 | 'sect' => 167, 'uuml' => 252, 'Prime' => 8243,
|
---|
52 | 'uml' => 168, 'yacute' => 253, 'lsaquo' => 8249,
|
---|
53 | 'copy' => 169, 'thorn' => 254, 'rsaquo' => 8250,
|
---|
54 | 'ordf' => 170, 'yuml' => 255, 'oline' => 8254,
|
---|
55 | 'laquo' => 171, 'OElig' => 338, 'frasl' => 8260,
|
---|
56 | 'not' => 172, 'oelig' => 339, 'euro' => 8364,
|
---|
57 | 'shy' => 173, 'Scaron' => 352, 'image' => 8465,
|
---|
58 | 'reg' => 174, 'scaron' => 353, 'weierp' => 8472,
|
---|
59 | 'macr' => 175, 'Yuml' => 376, 'real' => 8476,
|
---|
60 | 'deg' => 176, 'fnof' => 402, 'trade' => 8482,
|
---|
61 | 'plusmn' => 177, 'circ' => 710, 'alefsym'=> 8501,
|
---|
62 | 'sup2' => 178, 'tilde' => 732, 'larr' => 8592,
|
---|
63 | 'sup3' => 179, 'Alpha' => 913, 'uarr' => 8593,
|
---|
64 | 'acute' => 180, 'Beta' => 914, 'rarr' => 8594,
|
---|
65 | 'micro' => 181, 'Gamma' => 915, 'darr' => 8595,
|
---|
66 | 'para' => 182, 'Delta' => 916, 'harr' => 8596,
|
---|
67 | 'middot' => 183, 'Epsilon'=> 917, 'crarr' => 8629,
|
---|
68 | 'cedil' => 184, 'Zeta' => 918, 'lArr' => 8656,
|
---|
69 | 'sup1' => 185, 'Eta' => 919, 'uArr' => 8657,
|
---|
70 | 'ordm' => 186, 'Theta' => 920, 'rArr' => 8658,
|
---|
71 | 'raquo' => 187, 'Iota' => 921, 'dArr' => 8659,
|
---|
72 | 'frac14' => 188, 'Kappa' => 922, 'hArr' => 8660,
|
---|
73 | 'frac12' => 189, 'Lambda' => 923, 'forall' => 8704,
|
---|
74 | 'frac34' => 190, 'Mu' => 924, 'part' => 8706,
|
---|
75 | 'iquest' => 191, 'Nu' => 925, 'exist' => 8707,
|
---|
76 | 'Agrave' => 192, 'Xi' => 926, 'empty' => 8709,
|
---|
77 | 'Aacute' => 193, 'Omicron'=> 927, 'nabla' => 8711,
|
---|
78 | 'Acirc' => 194, 'Pi' => 928, 'isin' => 8712,
|
---|
79 | 'Atilde' => 195, 'Rho' => 929, 'notin' => 8713,
|
---|
80 | 'Auml' => 196, 'Sigma' => 931, 'ni' => 8715,
|
---|
81 | 'Aring' => 197, 'Tau' => 932, 'prod' => 8719,
|
---|
82 | 'AElig' => 198, 'Upsilon'=> 933, 'sum' => 8721,
|
---|
83 | 'Ccedil' => 199, 'Phi' => 934, 'minus' => 8722,
|
---|
84 | 'Egrave' => 200, 'Chi' => 935, 'lowast' => 8727,
|
---|
85 | 'Eacute' => 201, 'Psi' => 936, 'radic' => 8730,
|
---|
86 | 'Ecirc' => 202, 'Omega' => 937, 'prop' => 8733,
|
---|
87 | 'Euml' => 203, 'alpha' => 945, 'infin' => 8734,
|
---|
88 | 'Igrave' => 204, 'beta' => 946, 'ang' => 8736,
|
---|
89 | 'Iacute' => 205, 'gamma' => 947, 'and' => 8743,
|
---|
90 | 'Icirc' => 206, 'delta' => 948, 'or' => 8744,
|
---|
91 | 'Iuml' => 207, 'epsilon'=> 949, 'cap' => 8745,
|
---|
92 | 'ETH' => 208, 'zeta' => 950, 'cup' => 8746,
|
---|
93 | 'Ntilde' => 209, 'eta' => 951, 'int' => 8747,
|
---|
94 | 'Ograve' => 210, 'theta' => 952, 'there4' => 8756,
|
---|
95 | 'Oacute' => 211, 'iota' => 953, 'sim' => 8764,
|
---|
96 | 'Ocirc' => 212, 'kappa' => 954, 'cong' => 8773,
|
---|
97 | 'Otilde' => 213, 'lambda' => 955, 'asymp' => 8776,
|
---|
98 | 'Ouml' => 214, 'mu' => 956, 'ne' => 8800,
|
---|
99 | 'times' => 215, 'nu' => 957, 'equiv' => 8801,
|
---|
100 | 'Oslash' => 216, 'xi' => 958, 'le' => 8804,
|
---|
101 | 'Ugrave' => 217, 'omicron'=> 959, 'ge' => 8805,
|
---|
102 | 'Uacute' => 218, 'pi' => 960, 'sub' => 8834,
|
---|
103 | 'Ucirc' => 219, 'rho' => 961, 'sup' => 8835,
|
---|
104 | 'Uuml' => 220, 'sigmaf' => 962, 'nsub' => 8836,
|
---|
105 | 'Yacute' => 221, 'sigma' => 963, 'sube' => 8838,
|
---|
106 | 'THORN' => 222, 'tau' => 964, 'supe' => 8839,
|
---|
107 | 'szlig' => 223, 'upsilon'=> 965, 'oplus' => 8853,
|
---|
108 | 'agrave' => 224, 'phi' => 966, 'otimes' => 8855,
|
---|
109 | 'aacute' => 225, 'chi' => 967, 'perp' => 8869,
|
---|
110 | 'acirc' => 226, 'psi' => 968, 'sdot' => 8901,
|
---|
111 | 'atilde' => 227, 'omega' => 969, 'lceil' => 8968,
|
---|
112 | 'auml' => 228, 'thetasym'=>977, 'rceil' => 8969,
|
---|
113 | 'aring' => 229, 'upsih' => 978, 'lfloor' => 8970,
|
---|
114 | 'aelig' => 230, 'piv' => 982, 'rfloor' => 8971,
|
---|
115 | 'ccedil' => 231, 'ensp' => 8194, 'lang' => 9001,
|
---|
116 | 'egrave' => 232, 'emsp' => 8195, 'rang' => 9002,
|
---|
117 | 'eacute' => 233, 'thinsp' => 8201, 'loz' => 9674,
|
---|
118 | 'ecirc' => 234, 'zwnj' => 8204, 'spades' => 9824,
|
---|
119 | 'euml' => 235, 'zwj' => 8205, 'clubs' => 9827,
|
---|
120 | 'igrave' => 236, 'lrm' => 8206, 'hearts' => 9829,
|
---|
121 | 'iacute' => 237, 'rlm' => 8207, 'diams' => 9830,
|
---|
122 | 'icirc' => 238, 'ndash' => 8211,
|
---|
123 | 'iuml' => 239, 'mdash' => 8212,
|
---|
124 | );
|
---|
125 | my %entityName; # look up entity names by number (built as necessary)
|
---|
126 |
|
---|
127 | # HTML info
|
---|
128 | # (tag ID's are case insensitive and must be all lower case in tables)
|
---|
129 | %Image::ExifTool::HTML::Main = (
|
---|
130 | GROUPS => { 2 => 'Document' },
|
---|
131 | NOTES => q{
|
---|
132 | Meta information extracted from the header of HTML and XHTML files. This is
|
---|
133 | a mix of information found in the C<META> elements, C<XML> element, and the
|
---|
134 | C<TITLE> element.
|
---|
135 | },
|
---|
136 | dc => {
|
---|
137 | Name => 'DC',
|
---|
138 | SubDirectory => { TagTable => 'Image::ExifTool::HTML::dc' },
|
---|
139 | },
|
---|
140 | ncc => {
|
---|
141 | Name => 'NCC',
|
---|
142 | SubDirectory => { TagTable => 'Image::ExifTool::HTML::ncc' },
|
---|
143 | },
|
---|
144 | prod => {
|
---|
145 | Name => 'Prod',
|
---|
146 | SubDirectory => { TagTable => 'Image::ExifTool::HTML::prod' },
|
---|
147 | },
|
---|
148 | vw96 => {
|
---|
149 | Name => 'VW96',
|
---|
150 | SubDirectory => { TagTable => 'Image::ExifTool::HTML::vw96' },
|
---|
151 | },
|
---|
152 | 'http-equiv' => {
|
---|
153 | Name => 'HTTP-equiv',
|
---|
154 | SubDirectory => { TagTable => 'Image::ExifTool::HTML::equiv' },
|
---|
155 | },
|
---|
156 | o => {
|
---|
157 | Name => 'Office',
|
---|
158 | SubDirectory => { TagTable => 'Image::ExifTool::HTML::Office' },
|
---|
159 | },
|
---|
160 | abstract => { },
|
---|
161 | author => { },
|
---|
162 | classification => { },
|
---|
163 | copyright => { },
|
---|
164 | description => { },
|
---|
165 | distribution => { },
|
---|
166 | 'doc-class' => { Name => 'DocClass' },
|
---|
167 | 'doc-rights' => { Name => 'DocRights' },
|
---|
168 | 'doc-type' => { Name => 'DocType' },
|
---|
169 | formatter => { },
|
---|
170 | generator => { },
|
---|
171 | googlebot => { Name => 'GoogleBot' },
|
---|
172 | keywords => { List => 1 },
|
---|
173 | mssmarttagspreventparsing => { Name => 'NoMSSmartTags' },
|
---|
174 | originator => { },
|
---|
175 | owner => { },
|
---|
176 | progid => { Name => 'ProgID' },
|
---|
177 | rating => { },
|
---|
178 | refresh => { },
|
---|
179 | 'resource-type' => { Name => 'ResourceType' },
|
---|
180 | 'revisit-after' => { Name => 'RevisitAfter' },
|
---|
181 | robots => { List => 1 },
|
---|
182 | title => { Notes => "the only extracted tag which isn't from an HTML META element" },
|
---|
183 | );
|
---|
184 |
|
---|
185 | # ref 2
|
---|
186 | %Image::ExifTool::HTML::dc = (
|
---|
187 | GROUPS => { 1 => 'HTML-dc', 2 => 'Document' },
|
---|
188 | NOTES => 'Dublin Core schema tags (also used in XMP).',
|
---|
189 | contributor => { Groups => { 2 => 'Author' }, List => 'Bag' },
|
---|
190 | coverage => { },
|
---|
191 | creator => { Groups => { 2 => 'Author' }, List => 'Seq' },
|
---|
192 | date => {
|
---|
193 | Groups => { 2 => 'Time' },
|
---|
194 | List => 'Seq',
|
---|
195 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
196 | },
|
---|
197 | description => { },
|
---|
198 | 'format' => { },
|
---|
199 | identifier => { },
|
---|
200 | language => { List => 'Bag' },
|
---|
201 | publisher => { Groups => { 2 => 'Author' }, List => 'Bag' },
|
---|
202 | relation => { List => 'Bag' },
|
---|
203 | rights => { Groups => { 2 => 'Author' } },
|
---|
204 | source => { Groups => { 2 => 'Author' } },
|
---|
205 | subject => { List => 'Bag' },
|
---|
206 | title => { },
|
---|
207 | type => { List => 'Bag' },
|
---|
208 | );
|
---|
209 |
|
---|
210 | # ref 2
|
---|
211 | %Image::ExifTool::HTML::ncc = (
|
---|
212 | GROUPS => { 1 => 'HTML-ncc', 2 => 'Document' },
|
---|
213 | charset => { Name => 'CharacterSet' }, # name changed to avoid conflict with -charset option
|
---|
214 | depth => { },
|
---|
215 | files => { },
|
---|
216 | footnotes => { },
|
---|
217 | generator => { },
|
---|
218 | kbytesize => { Name => 'KByteSize' },
|
---|
219 | maxpagenormal => { Name => 'MaxPageNormal' },
|
---|
220 | multimediatype => { Name => 'MultimediaType' },
|
---|
221 | narrator => { },
|
---|
222 | pagefront => { Name => 'PageFront' },
|
---|
223 | pagenormal => { Name => 'PageNormal' },
|
---|
224 | pagespecial => { Name => 'PageSpecial' },
|
---|
225 | prodnotes => { Name => 'ProdNotes' },
|
---|
226 | producer => { },
|
---|
227 | produceddate => { Name => 'ProducedDate', Groups => { 2 => 'Time' } }, # YYYY-mm-dd
|
---|
228 | revision => { },
|
---|
229 | revisiondate => { Name => 'RevisionDate', Groups => { 2 => 'Time' } },
|
---|
230 | setinfo => { Name => 'SetInfo' },
|
---|
231 | sidebars => { },
|
---|
232 | sourcedate => { Name => 'SourceDate', Groups => { 2 => 'Time' } },
|
---|
233 | sourceedition => { Name => 'SourceEdition' },
|
---|
234 | sourcepublisher => { Name => 'SourcePublisher' },
|
---|
235 | sourcerights => { Name => 'SourceRights' },
|
---|
236 | sourcetitle => { Name => 'SourceTitle' },
|
---|
237 | tocitems => { Name => 'TOCItems' },
|
---|
238 | totaltime => { Name => 'Duration' }, # HH:MM:SS
|
---|
239 | );
|
---|
240 |
|
---|
241 | # ref 3
|
---|
242 | %Image::ExifTool::HTML::vw96 = (
|
---|
243 | GROUPS => { 1 => 'HTML-vw96', 2 => 'Document' },
|
---|
244 | objecttype => { Name => 'ObjectType' },
|
---|
245 | );
|
---|
246 |
|
---|
247 | # ref 2
|
---|
248 | %Image::ExifTool::HTML::prod = (
|
---|
249 | GROUPS => { 1 => 'HTML-prod', 2 => 'Document' },
|
---|
250 | reclocation => { Name => 'RecLocation' },
|
---|
251 | recengineer => { Name => 'RecEngineer' },
|
---|
252 | );
|
---|
253 |
|
---|
254 | # ref 3/4
|
---|
255 | %Image::ExifTool::HTML::equiv = (
|
---|
256 | GROUPS => { 1 => 'HTTP-equiv', 2 => 'Document' },
|
---|
257 | NOTES => 'These tags have a family 1 group name of "HTTP-equiv".',
|
---|
258 | 'cache-control' => { Name => 'CacheControl' },
|
---|
259 | 'content-disposition' => { Name => 'ContentDisposition' },
|
---|
260 | 'content-language' => { Name => 'ContentLanguage' },
|
---|
261 | 'content-script-type' => { Name => 'ContentScriptType' },
|
---|
262 | 'content-style-type' => { Name => 'ContentStyleType' },
|
---|
263 | # note: setting the HTMLCharset like this will miss any tags which come earlier
|
---|
264 | 'content-type' => { Name => 'ContentType', RawConv => \&SetHTMLCharset },
|
---|
265 | 'default-style' => { Name => 'DefaultStyle' },
|
---|
266 | expires => { },
|
---|
267 | 'ext-cache' => { Name => 'ExtCache' },
|
---|
268 | imagetoolbar => { Name => 'ImageToolbar' },
|
---|
269 | lotus => { },
|
---|
270 | 'page-enter' => { Name => 'PageEnter' },
|
---|
271 | 'page-exit' => { Name => 'PageExit' },
|
---|
272 | 'pics-label' => { Name => 'PicsLabel' },
|
---|
273 | pragma => { },
|
---|
274 | refresh => { },
|
---|
275 | 'reply-to' => { Name => 'ReplyTo' },
|
---|
276 | 'set-cookie' => { Name => 'SetCookie' },
|
---|
277 | 'site-enter' => { Name => 'SiteEnter' },
|
---|
278 | 'site-exit' => { Name => 'SiteExit' },
|
---|
279 | vary => { },
|
---|
280 | 'window-target' => { Name => 'WindowTarget' },
|
---|
281 | );
|
---|
282 |
|
---|
283 | # MS Office namespace (ref PH)
|
---|
284 | %Image::ExifTool::HTML::Office = (
|
---|
285 | GROUPS => { 1 => 'HTML-office', 2 => 'Document' },
|
---|
286 | NOTES => 'Tags written by Microsoft Office applications.',
|
---|
287 | Subject => { },
|
---|
288 | Author => { Groups => { 2 => 'Author' } },
|
---|
289 | Keywords => { },
|
---|
290 | Description => { },
|
---|
291 | Template => { },
|
---|
292 | LastAuthor => { Groups => { 2 => 'Author' } },
|
---|
293 | Revision => { Name => 'RevisionNumber' },
|
---|
294 | TotalTime => { Name => 'TotalEditTime', PrintConv => 'ConvertTimeSpan($val, 60)' },
|
---|
295 | Created => {
|
---|
296 | Name => 'CreateDate',
|
---|
297 | Groups => { 2 => 'Time' },
|
---|
298 | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
|
---|
299 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
300 | },
|
---|
301 | LastSaved => {
|
---|
302 | Name => 'ModifyDate',
|
---|
303 | Groups => { 2 => 'Time' },
|
---|
304 | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
|
---|
305 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
306 | },
|
---|
307 | LastSaved => {
|
---|
308 | Name => 'ModifyDate',
|
---|
309 | Groups => { 2 => 'Time' },
|
---|
310 | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
|
---|
311 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
312 | },
|
---|
313 | LastPrinted => {
|
---|
314 | Name => 'LastPrinted',
|
---|
315 | Groups => { 2 => 'Time' },
|
---|
316 | ValueConv => 'Image::ExifTool::XMP::ConvertXMPDate($val)',
|
---|
317 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
318 | },
|
---|
319 | Pages => { },
|
---|
320 | Words => { },
|
---|
321 | Characters => { },
|
---|
322 | Category => { },
|
---|
323 | Manager => { },
|
---|
324 | Company => { },
|
---|
325 | Lines => { },
|
---|
326 | Paragraphs => { },
|
---|
327 | CharactersWithSpaces => { },
|
---|
328 | Version => { Name => 'RevisionNumber' },
|
---|
329 | );
|
---|
330 |
|
---|
331 | #------------------------------------------------------------------------------
|
---|
332 | # Set HTMLCharset member based on content type
|
---|
333 | # Inputs: 0) content type string, 1) ExifTool ref
|
---|
334 | # Returns: original string
|
---|
335 | sub SetHTMLCharset($$)
|
---|
336 | {
|
---|
337 | my ($val, $exifTool) = @_;
|
---|
338 | $$exifTool{HTMLCharset} = $htmlCharset{lc $1} if $val =~ /charset=['"]?([-\w]+)/;
|
---|
339 | return $val;
|
---|
340 | }
|
---|
341 |
|
---|
342 | #------------------------------------------------------------------------------
|
---|
343 | # Convert single UTF-8 character to HTML character reference
|
---|
344 | # Inputs: 0) UTF-8 character sequence
|
---|
345 | # Returns: HTML character reference (ie. """);
|
---|
346 | # Note: Must be called via EscapeHTML to load name lookup
|
---|
347 | sub EscapeChar($)
|
---|
348 | {
|
---|
349 | my $ch = shift;
|
---|
350 | my $val;
|
---|
351 | if ($] < 5.006001) {
|
---|
352 | ($val) = Image::ExifTool::UnpackUTF8($ch);
|
---|
353 | } else {
|
---|
354 | # the meaning of "U0" is reversed as of Perl 5.10.0!
|
---|
355 | ($val) = unpack($] < 5.010000 ? 'U0U' : 'C0U', $ch);
|
---|
356 | }
|
---|
357 | return '?' unless defined $val;
|
---|
358 | return "&$entityName{$val};" if $entityName{$val};
|
---|
359 | return sprintf('&#x%x;',$val);
|
---|
360 | }
|
---|
361 |
|
---|
362 | #------------------------------------------------------------------------------
|
---|
363 | # Escape any special characters for HTML
|
---|
364 | # Inputs: 0) UTF-8 string to be escaped
|
---|
365 | # Returns: escaped string
|
---|
366 | sub EscapeHTML($)
|
---|
367 | {
|
---|
368 | my $str = shift;
|
---|
369 | # escape XML characters
|
---|
370 | $str = EscapeXML($str);
|
---|
371 | # escape other special characters if they exist
|
---|
372 | if ($str =~ /[\x80-\xff]/) {
|
---|
373 | # generate entity name lookup if necessary
|
---|
374 | unless (%entityName) {
|
---|
375 | local $_;
|
---|
376 | foreach (keys %entityNum) {
|
---|
377 | $entityName{$entityNum{$_}} = $_;
|
---|
378 | }
|
---|
379 | delete $entityName{39}; # 'apos' is not valid HTML
|
---|
380 | }
|
---|
381 | # suppress warnings
|
---|
382 | local $SIG{'__WARN__'} = sub { 1 };
|
---|
383 | # escape any non-ascii characters for HTML
|
---|
384 | $str =~ s/([\xc2-\xf7][\x80-\xbf]+)/EscapeChar($1)/sge;
|
---|
385 | }
|
---|
386 | return $str;
|
---|
387 | }
|
---|
388 |
|
---|
389 | #------------------------------------------------------------------------------
|
---|
390 | # Unescape all HTML character references
|
---|
391 | # Inputs: 0) string to be unescaped
|
---|
392 | # Returns: unescaped string
|
---|
393 | sub UnescapeHTML($)
|
---|
394 | {
|
---|
395 | return UnescapeXML(shift, \%entityNum);
|
---|
396 | }
|
---|
397 |
|
---|
398 | #------------------------------------------------------------------------------
|
---|
399 | # Extract information from a HTML file
|
---|
400 | # Inputs: 0) ExifTool object reference, 1) DirInfo reference
|
---|
401 | # Returns: 1 on success, 0 if this wasn't a valid HTML file
|
---|
402 | sub ProcessHTML($$)
|
---|
403 | {
|
---|
404 | my ($exifTool, $dirInfo) = @_;
|
---|
405 | my $raf = $$dirInfo{RAF};
|
---|
406 | my $buff;
|
---|
407 |
|
---|
408 | # validate HTML or XHTML file
|
---|
409 | $raf->Read($buff, 256) or return 0;
|
---|
410 | $buff =~ /^<(!DOCTYPE\s+HTML|HTML|\?xml)/i or return 0;
|
---|
411 | $buff =~ /<(!DOCTYPE\s+)?HTML/i or return 0 if $1 eq '?xml';
|
---|
412 | $exifTool->SetFileType();
|
---|
413 |
|
---|
414 | $raf->Seek(0,0) or $exifTool->Warn('Seek error'), return 1;
|
---|
415 |
|
---|
416 | local $/ = Image::ExifTool::PostScript::GetInputRecordSeparator($raf);
|
---|
417 | $/ or $exifTool->Warn('Invalid HTML data'), return 1;
|
---|
418 |
|
---|
419 | # extract header information
|
---|
420 | my $doc;
|
---|
421 | while ($raf->ReadLine($buff)) {
|
---|
422 | if (not defined $doc) {
|
---|
423 | # look for 'head' element
|
---|
424 | next unless $buff =~ /<head\b/ig;
|
---|
425 | $doc = substr($buff, pos($buff));
|
---|
426 | next;
|
---|
427 | }
|
---|
428 | $doc .= $buff;
|
---|
429 | last if $buff =~ m{</head>}i;
|
---|
430 | }
|
---|
431 | return 1 unless defined $doc;
|
---|
432 |
|
---|
433 | # process all elements in header
|
---|
434 | my $tagTablePtr = GetTagTable('Image::ExifTool::HTML::Main');
|
---|
435 | for (;;) {
|
---|
436 | last unless $doc =~ m{<([\w:.-]+)(.*?)>}sg;
|
---|
437 | my ($tagName, $attrs) = ($1, $2);
|
---|
438 | my $tag = lc($tagName);
|
---|
439 | my ($val, $grp);
|
---|
440 | unless ($attrs =~ m{/$}) { # self-contained XHTML tags end in '/>'
|
---|
441 | # look for element close
|
---|
442 | my $pos = pos($doc);
|
---|
443 | my $close = "</$tagName>";
|
---|
444 | # the following doesn't work on Solaris Perl 5.6.1 due to Perl bug:
|
---|
445 | # if ($doc =~ m{(.*?)</$tagName>}sg) {
|
---|
446 | # $val = $1;
|
---|
447 | if ($doc =~ m{$close}sg) {
|
---|
448 | $val = substr($doc, $pos, pos($doc)-$pos-length($close));
|
---|
449 | } else {
|
---|
450 | pos($doc) = $pos;
|
---|
451 | next unless $tag eq 'meta'; # META tags don't need to be closed
|
---|
452 | }
|
---|
453 | }
|
---|
454 | my $table = $tagTablePtr;
|
---|
455 | if ($tag eq 'meta') {
|
---|
456 | # parse HTML META element
|
---|
457 | undef $tag;
|
---|
458 | # tag name is in NAME or HTTP-EQUIV attribute
|
---|
459 | if ($attrs =~ /name=['"]?([\w:.-]+)/si) {
|
---|
460 | $tagName = $1;
|
---|
461 | } elsif ($attrs =~ /http-equiv=['"]?([\w:.-]+)/si) {
|
---|
462 | $tagName = "HTTP-equiv.$1";
|
---|
463 | } else {
|
---|
464 | next; # no name
|
---|
465 | }
|
---|
466 | $tag = lc($tagName);
|
---|
467 | # tag value is in CONTENT attribute
|
---|
468 | $val = $2 if $attrs =~ /content=(['"])(.*?)\1/si;
|
---|
469 | next unless $tag and defined $val;
|
---|
470 | # isolate group name (separator is '.' in HTML, but ':' in ref 2)
|
---|
471 | if ($tag =~ /^([\w-]+)[:.]([\w-]+)/) {
|
---|
472 | ($grp, $tag) = ($1, $2);
|
---|
473 | my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $grp);
|
---|
474 | if ($tagInfo and $$tagInfo{SubDirectory}) {
|
---|
475 | $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
|
---|
476 | } else {
|
---|
477 | $tag = "$grp.$tag";
|
---|
478 | }
|
---|
479 | }
|
---|
480 | } elsif ($tag eq 'xml') {
|
---|
481 | $exifTool->VPrint(0, "Parsing XML\n");
|
---|
482 | # parse XML tags (quick-and-dirty)
|
---|
483 | my $xml = $val;
|
---|
484 | while ($xml =~ /<([\w-]+):([\w-]+)(\s.*?)?>([^<]*?)<\/\1:\2>/g) {
|
---|
485 | ($grp, $tag, $val) = ($1, $2, $4);
|
---|
486 | my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $grp);
|
---|
487 | next unless $tagInfo and $$tagInfo{SubDirectory};
|
---|
488 | $table = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
|
---|
489 | unless ($$table{$tag}) {
|
---|
490 | my $name = ucfirst $tag;
|
---|
491 | $name =~ s/_x([0-9a-f]{4})_/chr(hex($1))/gie; # convert hex codes
|
---|
492 | $name =~ s/\s(.)/\U$1/g; # capitalize all words in tag name
|
---|
493 | $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters (also hex code wide chars)
|
---|
494 | Image::ExifTool::AddTagToTable($table, $tag, { Name => $name });
|
---|
495 | $exifTool->VPrint(0, " [adding $tag '$name']\n");
|
---|
496 | }
|
---|
497 | $val = $exifTool->Decode($val, $$exifTool{HTMLCharset}) if $$exifTool{HTMLCharset};
|
---|
498 | $exifTool->HandleTag($table, $tag, UnescapeXML($val));
|
---|
499 | }
|
---|
500 | next;
|
---|
501 | } else {
|
---|
502 | # the only other element we process is TITLE
|
---|
503 | next unless $tag eq 'title';
|
---|
504 | }
|
---|
505 | unless ($$table{$tag}) {
|
---|
506 | my $name = $tagName;
|
---|
507 | $name =~ s/\W+(\w)/\u$1/sg;
|
---|
508 | my $info = { Name => $name, Groups => { 0 => 'HTML' } };
|
---|
509 | $info->{Groups}->{1} = ($grp eq 'http-equiv' ? 'HTTP-equiv' : "HTML-$grp") if $grp;
|
---|
510 | Image::ExifTool::AddTagToTable($table, $tag, $info);
|
---|
511 | $exifTool->VPrint(0, " [adding $tag '$tagName']\n");
|
---|
512 | }
|
---|
513 | # recode if necessary
|
---|
514 | $val = $exifTool->Decode($val, $$exifTool{HTMLCharset}) if $$exifTool{HTMLCharset};
|
---|
515 | $val =~ s{\s*$/\s*}{ }sg; # replace linefeeds and indenting spaces
|
---|
516 | $val = UnescapeHTML($val); # unescape HTML character references
|
---|
517 | $exifTool->HandleTag($table, $tag, $val);
|
---|
518 | }
|
---|
519 | return 1;
|
---|
520 | }
|
---|
521 |
|
---|
522 | 1; # end
|
---|
523 |
|
---|
524 | __END__
|
---|
525 |
|
---|
526 | =head1 NAME
|
---|
527 |
|
---|
528 | Image::ExifTool::HTML - Read HTML meta information
|
---|
529 |
|
---|
530 | =head1 SYNOPSIS
|
---|
531 |
|
---|
532 | This module is used by Image::ExifTool
|
---|
533 |
|
---|
534 | =head1 DESCRIPTION
|
---|
535 |
|
---|
536 | This module contains routines required by Image::ExifTool to extract
|
---|
537 | meta information from HTML documents.
|
---|
538 |
|
---|
539 | =head1 AUTHOR
|
---|
540 |
|
---|
541 | Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
|
---|
542 |
|
---|
543 | This library is free software; you can redistribute it and/or modify it
|
---|
544 | under the same terms as Perl itself.
|
---|
545 |
|
---|
546 | =head1 REFERENCES
|
---|
547 |
|
---|
548 | =over 4
|
---|
549 |
|
---|
550 | =item L<http://www.w3.org/TR/html4/>
|
---|
551 |
|
---|
552 | =item L<http://www.daisy.org/publications/specifications/daisy_202.html>
|
---|
553 |
|
---|
554 | =item L<http://vancouver-webpages.com/META/metatags.detail.html>
|
---|
555 |
|
---|
556 | =item L<http://www.html-reference.com/META.htm>
|
---|
557 |
|
---|
558 | =back
|
---|
559 |
|
---|
560 | =head1 SEE ALSO
|
---|
561 |
|
---|
562 | L<Image::ExifTool::TagNames/HTML Tags>,
|
---|
563 | L<Image::ExifTool(3pm)|Image::ExifTool>
|
---|
564 |
|
---|
565 | =cut
|
---|
566 |
|
---|