source: main/trunk/greenstone2/perllib/cpan/Mojo/DOM/CSS.pm@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses using xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

File size: 17.1 KB
Line 
1package Mojo::DOM::CSS;
2use Mojo::Base -base;
3
4use Mojo::Util 'trim';
5
6has 'tree';
7
8my $ESCAPE_RE = qr/\\[^0-9a-fA-F]|\\[0-9a-fA-F]{1,6}/;
9my $ATTR_RE = qr/
10 \[
11 ((?:$ESCAPE_RE|[\w\-])+) # Key
12 (?:
13 (\W)?= # Operator
14 (?:"((?:\\"|[^"])*)"|'((?:\\'|[^'])*)'|([^\]]+?)) # Value
15 (?:\s+(i))? # Case-sensitivity
16 )?
17 \]
18/x;
19
20sub matches {
21 my $tree = shift->tree;
22 return $tree->[0] ne 'tag' ? undef : _match(_compile(@_), $tree, $tree);
23}
24
25sub select { _select(0, shift->tree, _compile(@_)) }
26sub select_one { _select(1, shift->tree, _compile(@_)) }
27
28sub _ancestor {
29 my ($selectors, $current, $tree, $one, $pos) = @_;
30
31 while ($current = $current->[3]) {
32 return undef if $current->[0] eq 'root' || $current eq $tree;
33 return 1 if _combinator($selectors, $current, $tree, $pos);
34 last if $one;
35 }
36
37 return undef;
38}
39
40sub _attr {
41 my ($name_re, $value_re, $current) = @_;
42
43 my $attrs = $current->[2];
44 for my $name (keys %$attrs) {
45 my $value = $attrs->{$name};
46 next if $name !~ $name_re || (!defined $value && defined $value_re);
47 return 1 if !(defined $value && defined $value_re) || $value =~ $value_re;
48 }
49
50 return undef;
51}
52
53sub _combinator {
54 my ($selectors, $current, $tree, $pos) = @_;
55
56 # Selector
57 return undef unless my $c = $selectors->[$pos];
58 if (ref $c) {
59 return undef unless _selector($c, $current);
60 return 1 unless $c = $selectors->[++$pos];
61 }
62
63 # ">" (parent only)
64 return _ancestor($selectors, $current, $tree, 1, ++$pos) if $c eq '>';
65
66 # "~" (preceding siblings)
67 return _sibling($selectors, $current, $tree, 0, ++$pos) if $c eq '~';
68
69 # "+" (immediately preceding siblings)
70 return _sibling($selectors, $current, $tree, 1, ++$pos) if $c eq '+';
71
72 # " " (ancestor)
73 return _ancestor($selectors, $current, $tree, 0, ++$pos);
74}
75
76sub _compile {
77 my ($css, %ns) = (trim('' . shift), @_);
78
79 my $group = [[]];
80 while (my $selectors = $group->[-1]) {
81 push @$selectors, [] unless @$selectors && ref $selectors->[-1];
82 my $last = $selectors->[-1];
83
84 # Separator
85 if ($css =~ /\G\s*,\s*/gc) { push @$group, [] }
86
87 # Combinator
88 elsif ($css =~ /\G\s*([ >+~])\s*/gc) { push @$selectors, $1 }
89
90 # Class or ID
91 elsif ($css =~ /\G([.#])((?:$ESCAPE_RE\s|\\.|[^,.#:[ >~+])+)/gco) {
92 my ($name, $op) = $1 eq '.' ? ('class', '~') : ('id', '');
93 push @$last, ['attr', _name($name), _value($op, $2)];
94 }
95
96 # Attributes
97 elsif ($css =~ /\G$ATTR_RE/gco) {
98 push @$last, ['attr', _name($1), _value($2 // '', $3 // $4 // $5, $6)];
99 }
100
101 # Pseudo-class
102 elsif ($css =~ /\G:([\w\-]+)(?:\(((?:\([^)]+\)|[^)])+)\))?/gcs) {
103 my ($name, $args) = (lc $1, $2);
104
105 # ":matches" and ":not" (contains more selectors)
106 $args = _compile($args, %ns) if $name eq 'matches' || $name eq 'not';
107
108 # ":nth-*" (with An+B notation)
109 $args = _equation($args) if $name =~ /^nth-/;
110
111 # ":first-*" (rewrite to ":nth-*")
112 ($name, $args) = ("nth-$1", [0, 1]) if $name =~ /^first-(.+)$/;
113
114 # ":last-*" (rewrite to ":nth-*")
115 ($name, $args) = ("nth-$name", [-1, 1]) if $name =~ /^last-/;
116
117 push @$last, ['pc', $name, $args];
118 }
119
120 # Tag
121 elsif ($css =~ /\G((?:$ESCAPE_RE\s|\\.|[^,.#:[ >~+])+)/gco) {
122 my $alias = (my $name = $1) =~ s/^([^|]*)\|// && $1 ne '*' ? $1 : undef;
123 my $ns = length $alias ? $ns{$alias} // return [['invalid']] : $alias;
124 push @$last, ['tag', $name eq '*' ? undef : _name($name), _unescape($ns)];
125 }
126
127 else {last}
128 }
129
130 return $group;
131}
132
133sub _empty { $_[0][0] eq 'comment' || $_[0][0] eq 'pi' }
134
135sub _equation {
136 return [0, 0] unless my $equation = shift;
137
138 # "even"
139 return [2, 2] if $equation =~ /^\s*even\s*$/i;
140
141 # "odd"
142 return [2, 1] if $equation =~ /^\s*odd\s*$/i;
143
144 # "4", "+4" or "-4"
145 return [0, $1] if $equation =~ /^\s*((?:\+|-)?\d+)\s*$/;
146
147 # "n", "4n", "+4n", "-4n", "n+1", "4n-1", "+4n-1" (and other variations)
148 return [0, 0]
149 unless $equation =~ /^\s*((?:\+|-)?(?:\d+)?)?n\s*((?:\+|-)\s*\d+)?\s*$/i;
150 return [$1 eq '-' ? -1 : !length $1 ? 1 : $1, join('', split(' ', $2 // 0))];
151}
152
153sub _match {
154 my ($group, $current, $tree) = @_;
155 _combinator([reverse @$_], $current, $tree, 0) and return 1 for @$group;
156 return undef;
157}
158
159sub _name {qr/(?:^|:)\Q@{[_unescape(shift)]}\E$/}
160
161sub _namespace {
162 my ($ns, $current) = @_;
163
164 my $attr = $current->[1] =~ /^([^:]+):/ ? "xmlns:$1" : 'xmlns';
165 while ($current) {
166 last if $current->[0] eq 'root';
167 return $current->[2]{$attr} eq $ns if exists $current->[2]{$attr};
168
169 $current = $current->[3];
170 }
171
172 # Failing to match yields true if searching for no namespace, false otherwise
173 return !length $ns;
174}
175
176sub _pc {
177 my ($class, $args, $current) = @_;
178
179 # ":checked"
180 return exists $current->[2]{checked} || exists $current->[2]{selected}
181 if $class eq 'checked';
182
183 # ":not"
184 return !_match($args, $current, $current) if $class eq 'not';
185
186 # ":matches"
187 return !!_match($args, $current, $current) if $class eq 'matches';
188
189 # ":empty"
190 return !grep { !_empty($_) } @$current[4 .. $#$current] if $class eq 'empty';
191
192 # ":root"
193 return $current->[3] && $current->[3][0] eq 'root' if $class eq 'root';
194
195 # ":link" and ":visited"
196 if ($class eq 'link' || $class eq 'visited') {
197 return undef unless $current->[0] eq 'tag' && exists $current->[2]{href};
198 return !!grep { $current->[1] eq $_ } qw(a area link);
199 }
200
201 # ":only-child" or ":only-of-type"
202 if ($class eq 'only-child' || $class eq 'only-of-type') {
203 my $type = $class eq 'only-of-type' ? $current->[1] : undef;
204 $_ ne $current and return undef for @{_siblings($current, $type)};
205 return 1;
206 }
207
208 # ":nth-child", ":nth-last-child", ":nth-of-type" or ":nth-last-of-type"
209 if (ref $args) {
210 my $type = $class eq 'nth-of-type'
211 || $class eq 'nth-last-of-type' ? $current->[1] : undef;
212 my @siblings = @{_siblings($current, $type)};
213 @siblings = reverse @siblings
214 if $class eq 'nth-last-child' || $class eq 'nth-last-of-type';
215
216 for my $i (0 .. $#siblings) {
217 next if (my $result = $args->[0] * $i + $args->[1]) < 1;
218 return undef unless my $sibling = $siblings[$result - 1];
219 return 1 if $sibling eq $current;
220 }
221 }
222
223 # Everything else
224 return undef;
225}
226
227sub _select {
228 my ($one, $tree, $group) = @_;
229
230 my @results;
231 my @queue = @$tree[($tree->[0] eq 'root' ? 1 : 4) .. $#$tree];
232 while (my $current = shift @queue) {
233 next unless $current->[0] eq 'tag';
234
235 unshift @queue, @$current[4 .. $#$current];
236 next unless _match($group, $current, $tree);
237 $one ? return $current : push @results, $current;
238 }
239
240 return $one ? undef : \@results;
241}
242
243sub _selector {
244 my ($selector, $current) = @_;
245
246 for my $s (@$selector) {
247 my $type = $s->[0];
248
249 # Tag
250 if ($type eq 'tag') {
251 return undef if defined $s->[1] && $current->[1] !~ $s->[1];
252 return undef if defined $s->[2] && !_namespace($s->[2], $current);
253 }
254
255 # Attribute
256 elsif ($type eq 'attr') { return undef unless _attr(@$s[1, 2], $current) }
257
258 # Pseudo-class
259 elsif ($type eq 'pc') { return undef unless _pc(@$s[1, 2], $current) }
260
261 # Invalid selector
262 else { return undef }
263 }
264
265 return 1;
266}
267
268sub _sibling {
269 my ($selectors, $current, $tree, $immediate, $pos) = @_;
270
271 my $found;
272 for my $sibling (@{_siblings($current)}) {
273 return $found if $sibling eq $current;
274
275 # "+" (immediately preceding sibling)
276 if ($immediate) { $found = _combinator($selectors, $sibling, $tree, $pos) }
277
278 # "~" (preceding sibling)
279 else { return 1 if _combinator($selectors, $sibling, $tree, $pos) }
280 }
281
282 return undef;
283}
284
285sub _siblings {
286 my ($current, $type) = @_;
287
288 my $parent = $current->[3];
289 my @siblings = grep { $_->[0] eq 'tag' }
290 @$parent[($parent->[0] eq 'root' ? 1 : 4) .. $#$parent];
291 @siblings = grep { $type eq $_->[1] } @siblings if defined $type;
292
293 return \@siblings;
294}
295
296sub _unescape {
297 return undef unless defined(my $value = shift);
298
299 # Remove escaped newlines
300 $value =~ s/\\\n//g;
301
302 # Unescape Unicode characters
303 $value =~ s/\\([0-9a-fA-F]{1,6})\s?/pack 'U', hex $1/ge;
304
305 # Remove backslash
306 $value =~ s/\\//g;
307
308 return $value;
309}
310
311sub _value {
312 my ($op, $value, $insensitive) = @_;
313 return undef unless defined $value;
314 $value = ($insensitive ? '(?i)' : '') . quotemeta _unescape($value);
315
316 # "~=" (word)
317 return qr/(?:^|\s+)$value(?:\s+|$)/ if $op eq '~';
318
319 # "|=" (hyphen-separated)
320 return qr/^$value(?:-|$)/ if $op eq '|';
321
322 # "*=" (contains)
323 return qr/$value/ if $op eq '*';
324
325 # "^=" (begins with)
326 return qr/^$value/ if $op eq '^';
327
328 # "$=" (ends with)
329 return qr/$value$/ if $op eq '$';
330
331 # Everything else
332 return qr/^$value$/;
333}
334
3351;
336
337=encoding utf8
338
339=head1 NAME
340
341Mojo::DOM::CSS - CSS selector engine
342
343=head1 SYNOPSIS
344
345 use Mojo::DOM::CSS;
346
347 # Select elements from DOM tree
348 my $css = Mojo::DOM::CSS->new(tree => $tree);
349 my $elements = $css->select('h1, h2, h3');
350
351=head1 DESCRIPTION
352
353L<Mojo::DOM::CSS> is the CSS selector engine used by L<Mojo::DOM>, based on the
354L<HTML Living Standard|https://html.spec.whatwg.org> and
355L<Selectors Level 3|http://www.w3.org/TR/css3-selectors/>.
356
357=head1 SELECTORS
358
359All CSS selectors that make sense for a standalone parser are supported.
360
361=head2 *
362
363Any element.
364
365 my $all = $css->select('*');
366
367=head2 E
368
369An element of type C<E>.
370
371 my $title = $css->select('title');
372
373=head2 E[foo]
374
375An C<E> element with a C<foo> attribute.
376
377 my $links = $css->select('a[href]');
378
379=head2 E[foo="bar"]
380
381An C<E> element whose C<foo> attribute value is exactly equal to C<bar>.
382
383 my $case_sensitive = $css->select('input[type="hidden"]');
384 my $case_sensitive = $css->select('input[type=hidden]');
385
386=head2 E[foo="bar" i]
387
388An C<E> element whose C<foo> attribute value is exactly equal to any
389(ASCII-range) case-permutation of C<bar>. Note that this selector is
390EXPERIMENTAL and might change without warning!
391
392 my $case_insensitive = $css->select('input[type="hidden" i]');
393 my $case_insensitive = $css->select('input[type=hidden i]');
394 my $case_insensitive = $css->select('input[class~="foo" i]');
395
396This selector is part of
397L<Selectors Level 4|http://dev.w3.org/csswg/selectors-4>, which is still a work
398in progress.
399
400=head2 E[foo~="bar"]
401
402An C<E> element whose C<foo> attribute value is a list of whitespace-separated
403values, one of which is exactly equal to C<bar>.
404
405 my $foo = $css->select('input[class~="foo"]');
406 my $foo = $css->select('input[class~=foo]');
407
408=head2 E[foo^="bar"]
409
410An C<E> element whose C<foo> attribute value begins exactly with the string
411C<bar>.
412
413 my $begins_with = $css->select('input[name^="f"]');
414 my $begins_with = $css->select('input[name^=f]');
415
416=head2 E[foo$="bar"]
417
418An C<E> element whose C<foo> attribute value ends exactly with the string
419C<bar>.
420
421 my $ends_with = $css->select('input[name$="o"]');
422 my $ends_with = $css->select('input[name$=o]');
423
424=head2 E[foo*="bar"]
425
426An C<E> element whose C<foo> attribute value contains the substring C<bar>.
427
428 my $contains = $css->select('input[name*="fo"]');
429 my $contains = $css->select('input[name*=fo]');
430
431=head2 E[foo|="en"]
432
433An C<E> element whose C<foo> attribute has a hyphen-separated list of values
434beginning (from the left) with C<en>.
435
436 my $english = $css->select('link[hreflang|=en]');
437
438=head2 E:root
439
440An C<E> element, root of the document.
441
442 my $root = $css->select(':root');
443
444=head2 E:nth-child(n)
445
446An C<E> element, the C<n-th> child of its parent.
447
448 my $third = $css->select('div:nth-child(3)');
449 my $odd = $css->select('div:nth-child(odd)');
450 my $even = $css->select('div:nth-child(even)');
451 my $top3 = $css->select('div:nth-child(-n+3)');
452
453=head2 E:nth-last-child(n)
454
455An C<E> element, the C<n-th> child of its parent, counting from the last one.
456
457 my $third = $css->select('div:nth-last-child(3)');
458 my $odd = $css->select('div:nth-last-child(odd)');
459 my $even = $css->select('div:nth-last-child(even)');
460 my $bottom3 = $css->select('div:nth-last-child(-n+3)');
461
462=head2 E:nth-of-type(n)
463
464An C<E> element, the C<n-th> sibling of its type.
465
466 my $third = $css->select('div:nth-of-type(3)');
467 my $odd = $css->select('div:nth-of-type(odd)');
468 my $even = $css->select('div:nth-of-type(even)');
469 my $top3 = $css->select('div:nth-of-type(-n+3)');
470
471=head2 E:nth-last-of-type(n)
472
473An C<E> element, the C<n-th> sibling of its type, counting from the last one.
474
475 my $third = $css->select('div:nth-last-of-type(3)');
476 my $odd = $css->select('div:nth-last-of-type(odd)');
477 my $even = $css->select('div:nth-last-of-type(even)');
478 my $bottom3 = $css->select('div:nth-last-of-type(-n+3)');
479
480=head2 E:first-child
481
482An C<E> element, first child of its parent.
483
484 my $first = $css->select('div p:first-child');
485
486=head2 E:last-child
487
488An C<E> element, last child of its parent.
489
490 my $last = $css->select('div p:last-child');
491
492=head2 E:first-of-type
493
494An C<E> element, first sibling of its type.
495
496 my $first = $css->select('div p:first-of-type');
497
498=head2 E:last-of-type
499
500An C<E> element, last sibling of its type.
501
502 my $last = $css->select('div p:last-of-type');
503
504=head2 E:only-child
505
506An C<E> element, only child of its parent.
507
508 my $lonely = $css->select('div p:only-child');
509
510=head2 E:only-of-type
511
512An C<E> element, only sibling of its type.
513
514 my $lonely = $css->select('div p:only-of-type');
515
516=head2 E:empty
517
518An C<E> element that has no children (including text nodes).
519
520 my $empty = $css->select(':empty');
521
522=head2 E:link
523
524An C<E> element being the source anchor of a hyperlink of which the target is
525not yet visited (C<:link>) or already visited (C<:visited>). Note that
526L<Mojo::DOM::CSS> is not stateful, therefore C<:link> and C<:visited> yield
527exactly the same results.
528
529 my $links = $css->select(':link');
530 my $links = $css->select(':visited');
531
532=head2 E:visited
533
534Alias for L</"E:link">.
535
536=head2 E:checked
537
538A user interface element C<E> which is checked (for instance a radio-button or
539checkbox).
540
541 my $input = $css->select(':checked');
542
543=head2 E.warning
544
545An C<E> element whose class is "warning".
546
547 my $warning = $css->select('div.warning');
548
549=head2 E#myid
550
551An C<E> element with C<ID> equal to "myid".
552
553 my $foo = $css->select('div#foo');
554
555=head2 E:not(s1, s2)
556
557An C<E> element that does not match either compound selector C<s1> or compound
558selector C<s2>. Note that support for compound selectors is EXPERIMENTAL and
559might change without warning!
560
561 my $others = $css->select('div p:not(:first-child, :last-child)');
562
563Support for compound selectors was added as part of
564L<Selectors Level 4|http://dev.w3.org/csswg/selectors-4>, which is still a work
565in progress.
566
567=head2 E:matches(s1, s2)
568
569An C<E> element that matches compound selector C<s1> and/or compound selector
570C<s2>. Note that this selector is EXPERIMENTAL and might change without warning!
571
572 my $headers = $css->select(':matches(section, article, aside, nav) h1');
573
574This selector is part of
575L<Selectors Level 4|http://dev.w3.org/csswg/selectors-4>, which is still a work
576in progress.
577
578=head2 A|E
579
580An C<E> element that belongs to the namespace alias C<A> from
581L<CSS Namespaces Module Level 3|https://www.w3.org/TR/css-namespaces-3/>.
582Key/value pairs passed to selector methods are used to declare namespace
583aliases.
584
585 my $elem = $css->select('lq|elem', lq => 'http://example.com/q-markup');
586
587Using an empty alias searches for an element that belongs to no namespace.
588
589 my $div = $c->select('|div');
590
591=head2 E F
592
593An C<F> element descendant of an C<E> element.
594
595 my $headlines = $css->select('div h1');
596
597=head2 E E<gt> F
598
599An C<F> element child of an C<E> element.
600
601 my $headlines = $css->select('html > body > div > h1');
602
603=head2 E + F
604
605An C<F> element immediately preceded by an C<E> element.
606
607 my $second = $css->select('h1 + h2');
608
609=head2 E ~ F
610
611An C<F> element preceded by an C<E> element.
612
613 my $second = $css->select('h1 ~ h2');
614
615=head2 E, F, G
616
617Elements of type C<E>, C<F> and C<G>.
618
619 my $headlines = $css->select('h1, h2, h3');
620
621=head2 E[foo=bar][bar=baz]
622
623An C<E> element whose attributes match all following attribute selectors.
624
625 my $links = $css->select('a[foo^=b][foo$=ar]');
626
627=head1 ATTRIBUTES
628
629L<Mojo::DOM::CSS> implements the following attributes.
630
631=head2 tree
632
633 my $tree = $css->tree;
634 $css = $css->tree(['root']);
635
636Document Object Model. Note that this structure should only be used very
637carefully since it is very dynamic.
638
639=head1 METHODS
640
641L<Mojo::DOM::CSS> inherits all methods from L<Mojo::Base> and implements the
642following new ones.
643
644=head2 matches
645
646 my $bool = $css->matches('head > title');
647 my $bool = $css->matches('svg|line', svg => 'http://www.w3.org/2000/svg');
648
649Check if first node in L</"tree"> matches the CSS selector. Trailing key/value
650pairs can be used to declare xml namespace aliases.
651
652=head2 select
653
654 my $results = $css->select('head > title');
655 my $results = $css->select('svg|line', svg => 'http://www.w3.org/2000/svg');
656
657Run CSS selector against L</"tree">. Trailing key/value pairs can be used to
658declare xml namespace aliases.
659
660=head2 select_one
661
662 my $result = $css->select_one('head > title');
663 my $result =
664 $css->select_one('svg|line', svg => 'http://www.w3.org/2000/svg');
665
666Run CSS selector against L</"tree"> and stop as soon as the first node matched.
667Trailing key/value pairs can be used to declare xml namespace aliases.
668
669=head1 SEE ALSO
670
671L<Mojolicious>, L<Mojolicious::Guides>, L<https://mojolicious.org>.
672
673=cut
Note: See TracBrowser for help on using the repository browser.