source: main/trunk/greenstone2/perllib/cpan/Mojo/DOM.pm@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago

1. First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 3. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 4. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

File size: 29.9 KB
RevLine 
[32205]1package Mojo::DOM;
2use Mojo::Base -strict;
# Operator overloading: array dereference yields the child nodes, hash
# dereference yields the attributes, stringification renders the node and
# boolean context is always true. Other operators use Perl's fallback rules.
use overload
  '""'     => sub { $_[0]->to_string },
  '%{}'    => sub { $_[0]->attr },
  '@{}'    => sub { $_[0]->child_nodes },
  bool     => sub {1},
  fallback => 1;
9
10# "Fry: This snow is beautiful. I'm glad global warming never happened.
11# Leela: Actually, it did. But thank God nuclear winter canceled it out."
12use Mojo::Collection;
13use Mojo::DOM::CSS;
14use Mojo::DOM::HTML;
15use Scalar::Util qw(blessed weaken);
16use Storable 'dclone';
17
# Text of this node's children, descending into nested tags (the "all" flag
# passed to _text is set).
sub all_text {
  my $self = shift;
  return _text(_nodes($self->tree), 1);
}
19
# Collection of ancestor nodes, optionally narrowed by a selector.
sub ancestors {
  my ($self, $selector) = @_;
  my $found = $self->_collect([_ancestors($self->tree)]);
  return _select($found, $selector);
}
21
# Insert new content after this node (offset 1 relative to its position).
sub append {
  my ($self, @fragment) = @_;
  return $self->_add(1, @fragment);
}
# Add new content at the end of this node's content without replacing any.
sub append_content {
  my ($self, @fragment) = @_;
  return $self->_content(1, 0, @fragment);
}
24
# First match for the given selector arguments, or undef without a match.
sub at {
  my ($self, @args) = @_;

  my $match = $self->_css->select_one(@args);
  return undef unless $match;

  return $self->_build($match, $self->xml);
}
30
# Attribute accessor.
#   no arguments       -> attribute hash (a fresh empty hash for non-tag nodes)
#   one plain argument -> value of that attribute
#   hash ref or pairs  -> merge into the attributes and return the invocant
sub attr {
  my ($self, @args) = @_;

  # Hash
  my $tree  = $self->tree;
  my $attrs = $tree->[0] eq 'tag' ? $tree->[2] : {};
  return $attrs unless @args;

  # Get
  return $attrs->{$args[0]} if @args == 1 && !ref $args[0];

  # Set
  my $values = ref $args[0] ? $args[0] : {@args};
  $attrs->{$_} = $values->{$_} for keys %$values;

  return $self;
}
48
# Collection of all direct child nodes.
sub child_nodes {
  my $self = shift;
  return $self->_collect(_nodes($self->tree));
}
50
# Collection of direct child tags, optionally narrowed by a selector.
sub children {
  my ($self, $selector) = @_;
  my $tags = $self->_collect(_nodes($self->tree, 1));
  return _select($tags, $selector);
}
52
# Get or set this node's content. For "root" and "tag" nodes the content is
# the rendered child nodes (setting goes through _content); for every other
# node type it is the raw value stored in the second slot of the node.
sub content {
  my ($self, @args) = @_;

  my $type = $self->type;

  # Nodes without children carry their content directly
  unless ($type eq 'root' || $type eq 'tag') {
    return $self->tree->[1] unless @args;
    $self->tree->[1] = $args[0];
    return $self;
  }

  # Replace all children
  return $self->_content(0, 1, @args) if @args;

  # Render all children
  my $html = Mojo::DOM::HTML->new(xml => $self->xml);
  return join '', map { $html->tree($_)->render } @{_nodes($self->tree)};
}
67
# Collection of all descendant nodes, as flattened by _all.
sub descendant_nodes {
  my $self = shift;
  return $self->_collect(_all(_nodes($self->tree)));
}
69
# Collection of all matches for the given selector arguments.
sub find {
  my ($self, @args) = @_;
  return $self->_collect($self->_css->select(@args));
}
74
# Collection of sibling tags after this node, optionally narrowed by a
# selector.
sub following {
  my ($self, $selector) = @_;
  my $after = $self->_collect(_siblings($self->tree, 1, 1));
  return _select($after, $selector);
}
# Collection of all sibling nodes after this node.
sub following_nodes {
  my $self = shift;
  return $self->_collect(_siblings($self->tree, 0, 1));
}
77
# Delegate the selector check to the CSS engine built for this node's tree.
sub matches {
  my ($self, @args) = @_;
  return $self->_css->matches(@args);
}
79
# Namespace of this element, or undef for non-tag nodes and when no
# declaration is found on the element or any of its ancestors.
sub namespace {
  my $self = shift;

  my $tree = $self->tree;
  return undef unless $tree->[0] eq 'tag';

  # A prefixed tag name ("svg:circle") is resolved through "xmlns:<prefix>"
  my $ns = $tree->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef;

  # Walk from the element itself up through its ancestors
  for my $node ($tree, _ancestors($tree)) {
    my $attrs = $node->[2];

    # Namespace for prefix
    if ($ns) {
      return $attrs->{$ns} if exists $attrs->{$ns};
    }

    # Namespace attribute
    elsif (defined $attrs->{xmlns}) {
      return $attrs->{xmlns};
    }
  }

  return undef;
}
99
# Constructor. The object is a blessed reference to a scalar holding a
# Mojo::DOM::HTML instance; any arguments are handed to parse.
sub new {
  my ($class, @args) = @_;

  my $html = Mojo::DOM::HTML->new;
  my $self = bless \$html, ref($class) || $class;

  return $self->parse(@args) if @args;
  return $self;
}
105
# Build a new object from tag arguments. When called on an instance (rather
# than a class name) the new object takes over the instance's xml setting.
sub new_tag {
  my ($self, @args) = @_;

  my $new = $self->new;
  $$new->tag(@args);
  $$new->xml($$self->xml) if ref $self;

  return $new;
}
113
# Next sibling tag, or undef if there is none.
sub next {
  my $self = shift;
  return $self->_maybe(_siblings($self->tree, 1, 1, 0));
}
# Next sibling node of any type, or undef if there is none.
sub next_node {
  my $self = shift;
  return $self->_maybe(_siblings($self->tree, 0, 1, 0));
}
116
# Parent of this node, or undef for the root node.
sub parent {
  my $self = shift;

  my $tree = $self->tree;
  return undef if $tree->[0] eq 'root';

  return $self->_build(_parent($tree), $self->xml);
}
122
# Hand the input to the wrapped Mojo::DOM::HTML object. Returns the invocant
# when that call yields a true value, otherwise the false value itself.
sub parse {
  my ($self, $input) = @_;
  my $parsed = $$self->parse($input);
  return $parsed ? $self : $parsed;
}
124
# Collection of sibling tags before this node, optionally narrowed by a
# selector.
sub preceding {
  my ($self, $selector) = @_;
  my $before = $self->_collect(_siblings($self->tree, 1, 0));
  return _select($before, $selector);
}
# Collection of all sibling nodes before this node.
sub preceding_nodes {
  my $self = shift;
  return $self->_collect(_siblings($self->tree, 0));
}
127
# Insert new content before this node (offset 0 relative to its position).
sub prepend {
  my ($self, @fragment) = @_;
  return $self->_add(0, @fragment);
}
# Add new content at the start of this node's content without replacing any.
sub prepend_content {
  my ($self, @fragment) = @_;
  return $self->_content(0, 0, @fragment);
}
130
# Previous sibling tag, or undef if there is none.
sub previous {
  my $self = shift;
  return $self->_maybe(_siblings($self->tree, 1, 0, -1));
}
# Previous sibling node of any type, or undef if there is none.
sub previous_node {
  my $self = shift;
  return $self->_maybe(_siblings($self->tree, 0, 0, -1));
}
133
# Removing a node is replacing it with an empty fragment.
sub remove {
  my $self = shift;
  return $self->replace('');
}
135
# Replace this node with new content. The root node is simply re-parsed;
# any other node is swapped out inside its parent.
sub replace {
  my ($self, $new) = @_;

  my $tree = $self->tree;
  return $self->parse($new) if $tree->[0] eq 'root';

  my $parent = _parent($tree);
  return $self->_replace($parent, $tree, _nodes($self->_parse($new)));
}
141
# Object for the topmost ancestor, or the invocant itself when it has none.
sub root {
  my $self = shift;

  my $top = _ancestors($self->tree, 1);
  return $top ? $self->_build($top, $self->xml) : $self;
}
147
# Build a "tag:nth-child(n) > ..." selector path for this element, outermost
# ancestor first. Returns undef for non-tag nodes.
sub selector {
  my $self = shift;

  my $tree = $self->tree;
  return undef unless $tree->[0] eq 'tag';

  # Position is the number of preceding sibling tags plus one
  my @steps;
  for my $node ($tree, _ancestors($tree)) {
    my $position = @{_siblings($node, 1)} + 1;
    unshift @steps, $node->[1] . ':nth-child(' . $position . ')';
  }

  return join ' > ', @steps;
}
154
# Replace this element with its own child nodes. Non-tag nodes are returned
# unchanged.
sub strip {
  my $self = shift;

  my $tree = $self->tree;
  return $self unless $tree->[0] eq 'tag';

  # For a tag node _parent reads the parent link in slot 3
  return $self->_replace(_parent($tree), $tree, _nodes($tree));
}
160
# Get or set the tag name. Returns undef for non-tag nodes; a false argument
# (including none at all) is treated as a read.
sub tag {
  my ($self, $tag) = @_;

  my $tree = $self->tree;
  return undef unless $tree->[0] eq 'tag';

  # Set
  if ($tag) {
    $tree->[1] = $tag;
    return $self;
  }

  # Get
  return $tree->[1];
}
168
# Forward to the Mojo::Base implementation.
sub tap {
  my ($self, @args) = @_;
  return $self->Mojo::Base::tap(@args);
}
170
# Text of this node's direct children only (the "all" flag passed to _text is
# cleared, so nested tags are skipped).
sub text {
  my $self = shift;
  return _text(_nodes($self->tree), 0);
}
172
# Render via the wrapped Mojo::DOM::HTML object.
sub to_string {
  my $self = shift;
  return $$self->render;
}
174
# Get or set the tree on the wrapped Mojo::DOM::HTML object. The setter
# returns the invocant when the underlying call yields a true value.
sub tree {
  my ($self, @args) = @_;

  # Get
  return $$self->tree unless @args;

  # Set
  my $result = $$self->tree($args[0]);
  return $result ? $self : $result;
}
176
# Node type, stored in the first slot of the tree.
sub type {
  my $self = shift;
  return $self->tree->[0];
}
178
# Extract the value of a form element.
#   option                 -> "value" attribute, falling back to the text
#   input (radio/checkbox) -> "value" attribute, falling back to 'on'
#   textarea               -> the text
#   select                 -> value of the last matching option, or with a
#                             "multiple" attribute an array reference of all
#                             matching values (undef when there are none)
#   anything else          -> "value" attribute
# Returns undef for nodes that are not elements.
sub val {
  my $self = shift;

  # Non-element nodes have no tag name and no attributes. Return early rather
  # than comparing an undefined tag name below, which produced "uninitialized
  # value" warnings before arriving at the same undef result.
  my $tag = $self->tag;
  return undef unless defined $tag;

  # "option"
  return $self->{value} // $self->text if $tag eq 'option';

  # "input" ("type=checkbox" and "type=radio")
  my $type = $self->{type} // '';
  return $self->{value} // 'on'
    if $tag eq 'input' && ($type eq 'radio' || $type eq 'checkbox');

  # "textarea", "input" or "button"
  return $tag eq 'textarea' ? $self->text : $self->{value} if $tag ne 'select';

  # "select": checked options that are not disabled, directly or through a
  # disabled "optgroup" ancestor
  my $v = $self->find('option:checked:not([disabled])')
    ->grep(sub { !$_->ancestors('optgroup[disabled]')->size })->map('val');
  return exists $self->{multiple} ? $v->size ? $v->to_array : undef : $v->last;
}
198
# Forward to the Mojo::Base implementation.
sub with_roles {
  my ($self, @roles) = @_;
  return $self->Mojo::Base::with_roles(@roles);
}
200
# Wrap new content around this node itself.
sub wrap {
  my ($self, @fragment) = @_;
  return $self->_wrap(0, @fragment);
}
# Wrap new content around this node's content.
sub wrap_content {
  my ($self, @fragment) = @_;
  return $self->_wrap(1, @fragment);
}
203
# Get or set the xml flag on the wrapped Mojo::DOM::HTML object. The setter
# returns the invocant when the underlying call yields a true value.
sub xml {
  my ($self, @args) = @_;

  # Get
  return $$self->xml unless @args;

  # Set
  my $result = $$self->xml($args[0]);
  return $result ? $self : $result;
}
205
# Insert parsed content next to this node inside its parent. An offset of 0
# inserts before the node, 1 inserts after it. Root nodes are left untouched.
sub _add {
  my ($self, $offset, $new) = @_;

  my $tree = $self->tree;
  return $self if $tree->[0] eq 'root';

  my $parent = _parent($tree);
  my $index  = _offset($parent, $tree) + $offset;
  my $nodes  = _link($parent, _nodes($self->_parse($new)));
  splice @$parent, $index, 0, @$nodes;

  return $self;
}
217
# Flatten a list of nodes in place so that every tag is followed by all of
# its descendants. Returns the same array reference it was given.
sub _all {
  my $nodes = shift;

  my @flat;
  for my $node (@$nodes) {
    push @flat, $node;
    push @flat, @{_all(_nodes($node))} if $node->[0] eq 'tag';
  }
  @$nodes = @flat;

  return $nodes;
}
223
# Walk the parent links upwards. With a true second argument only the topmost
# node reached is returned; otherwise all collected nodes except the topmost
# one. Returns an empty list when the node has no parent.
sub _ancestors {
  my ($tree, $root) = @_;

  my $node = _parent($tree);
  return () unless $node;

  # Follow the parent link (slot 3) for as long as we are on tag nodes
  my @chain;
  while ($node) {
    push @chain, $node;
    last unless $node->[0] eq 'tag';
    $node = $node->[3];
  }

  return $root ? $chain[-1] : @chain[0 .. $#chain - 1];
}
233
# Create a new object of the invocant's class around the given tree and xml
# setting.
sub _build {
  my ($self, $tree, $xml) = @_;
  return $self->new->tree($tree)->xml($xml);
}
235
# Turn an array reference of raw nodes (default: none) into a
# Mojo::Collection of objects sharing this object's xml setting.
sub _collect {
  my ($self, $nodes) = @_;
  $nodes //= [];

  my $xml     = $self->xml;
  my @objects = map { $self->_build($_, $xml) } @$nodes;

  return Mojo::Collection->new(@objects);
}
241
# Shared implementation behind content, append_content and prepend_content.
#   $start  true: insert at the end, false: insert at the first child position
#   $offset true: replace the existing children instead of inserting
# For node types other than "root" and "tag" the new value is concatenated
# with the old raw content instead.
sub _content {
  my ($self, $start, $offset, $new) = @_;

  my $tree = $self->tree;
  my $type = $tree->[0];

  # Raw content
  if ($type ne 'root' && $type ne 'tag') {
    my $old = $self->content;
    return $self->content($start ? $old . $new : $new . $old);
  }

  # Child nodes
  my $index  = $start  ? scalar @$tree : _start($tree);
  my $length = $offset ? $#$tree       : 0;
  my $nodes  = _link($tree, _nodes($self->_parse($new)));
  splice @$tree, $index, $length, @$nodes;

  return $self;
}
257
# Fresh CSS engine for this node's tree.
sub _css {
  my $self = shift;
  return Mojo::DOM::CSS->new(tree => $self->tree);
}
259
# Wrap the given nodes in a new root node and link them to it.
sub _fragment {
  my @children = @_;

  my $root = ['root', @children];
  _link($root, \@children);

  return $root;
}
261
# Point every child at its new parent and return the children. The parent
# link lives in slot 3 for tags and slot 2 for all other nodes, and is
# weakened so it does not hold a strong reference to the parent.
sub _link {
  my ($parent, $children) = @_;

  for my $child (@$children) {
    my $slot = $child->[0] eq 'tag' ? 3 : 2;
    $child->[$slot] = $parent;
    weaken $child->[$slot];
  }

  return $children;
}
274
# Build an object for the given tree, or return undef when there is none.
sub _maybe {
  my ($self, $tree) = @_;
  return undef unless $tree;
  return $self->_build($tree, $self->xml);
}
276
# Child nodes of a raw tree as a new array reference; with a true second
# argument only the tag nodes are kept. Returns an empty list when no tree
# is given.
sub _nodes {
  my ($tree, $tags_only) = @_;

  return () unless $tree;

  # Children start after the node's own header slots
  my @nodes = @{$tree}[_start($tree) .. $#$tree];
  return \@nodes unless $tags_only;

  return [grep { $_->[0] eq 'tag' } @nodes];
}
282
# Index of a child inside its parent's array. If the child is not found the
# index one past the last element is returned.
sub _offset {
  my ($parent, $child) = @_;

  my $index = _start($parent);
  for my $node (@{$parent}[$index .. $#$parent]) {
    last if $node eq $child;
    $index++;
  }

  return $index;
}
289
# Parent link of a raw node: slot 3 for tags, slot 2 for everything else.
sub _parent {
  my $node = shift;
  return $node->[3] if $node->[0] eq 'tag';
  return $node->[2];
}
291
# Turn new content into a tree with a root node. Mojo::DOM objects are deep
# cloned (and wrapped in a fragment when they are not a root); anything else
# is parsed with this object's xml setting.
sub _parse {
  my ($self, $input) = @_;

  if (blessed $input && $input->isa('Mojo::DOM')) {
    my $copy = dclone $input->tree;
    return $copy->[0] eq 'root' ? $copy : _fragment($copy);
  }

  return Mojo::DOM::HTML->new(xml => $self->xml)->parse($input)->tree;
}
299
# Swap one child of a parent for a list of new nodes and return this
# object's parent.
sub _replace {
  my ($self, $parent, $child, $nodes) = @_;

  my $index = _offset($parent, $child);
  splice @$parent, $index, 1, @{_link($parent, $nodes)};

  return $self->parent;
}
305
# Filter a collection by selector, or pass it through when no selector is
# given.
sub _select {
  my ($collection, $selector) = @_;
  return $collection unless $selector;
  return $collection->grep(matches => $selector);
}
307
# Siblings of a raw node.
#   $tags  true: keep tag nodes only
#   $tail  true: siblings after the node, false: siblings before it
#   $i     when defined, return only the sibling at that index (or undef)
#          instead of the whole array reference
sub _siblings {
  my ($tree, $tags, $tail, $i) = @_;

  # The root has no siblings
  if ($tree->[0] eq 'root') { return defined $i ? undef : [] }

  # Find the position of the node among its parent's children
  my $nodes = _nodes(_parent($tree));
  my $match = -1;
  for my $node (@$nodes) {
    $match++;
    last if $node eq $tree;
  }

  # Drop the node itself and everything on the unwanted side
  if   ($tail) { splice @$nodes, 0, $match + 1 }
  else         { splice @$nodes, $match, @$nodes - $match }

  @$nodes = grep { $_->[0] eq 'tag' } @$nodes if $tags;

  return $nodes unless defined $i;

  # An index of -1 on an empty list must not fall through to the element
  # lookup
  return undef if $i == -1 && !@$nodes;
  return $nodes->[$i];
}
324
# Index of the first child slot in a raw node: 1 for the root, 4 otherwise.
sub _start {
  my $node = shift;
  return $node->[0] eq 'root' ? 1 : 4;
}
326
# Concatenate the content of "text", "cdata" and "raw" nodes. With a true
# second argument the children of tags are processed in place of the tag.
# The node list passed in is consumed.
sub _text {
  my ($nodes, $all) = @_;

  my %textual = map { $_ => 1 } qw(text cdata raw);

  my $text = '';
  while (my $node = shift @$nodes) {
    my $type = $node->[0];

    # Text
    if ($textual{$type}) { $text .= $node->[1] }

    # Nested tag
    elsif ($all && $type eq 'tag') { unshift @$nodes, @{_nodes($node)} }
  }

  return $text;
}
345
# Shared implementation behind wrap and wrap_content. With a true $content
# the node's children are moved into the wrapper; otherwise the node itself
# is. The wrapped nodes are appended to the innermost tag found by following
# the first tag child of the parsed wrapper at each level.
sub _wrap {
  my ($self, $content, $new) = @_;

  # The root cannot be wrapped, and only "root" and "tag" nodes have content
  # that can be wrapped
  my $tree = $self->tree;
  my $type = $tree->[0];
  if   ($content) { return $self if $type ne 'root' && $type ne 'tag' }
  else            { return $self if $type eq 'root' }

  # Find innermost tag
  my $wrapper = $self->_parse($new);
  my $cursor  = $wrapper;
  my $innermost;
  while (my $child = _nodes($cursor, 1)->[0]) {
    $innermost = $cursor = $child;
  }
  return $self unless $innermost;

  # Wrap content
  if ($content) {
    push @$innermost, @{_link($innermost, _nodes($tree))};
    splice @$tree, _start($tree), $#$tree, @{_link($tree, _nodes($wrapper))};
    return $self;
  }

  # Wrap element
  $self->_replace(_parent($tree), $tree, _nodes($wrapper));
  push @$innermost, @{_link($innermost, [$tree])};

  return $self;
}
370
3711;
372
373=encoding utf8
374
375=head1 NAME
376
377Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors
378
379=head1 SYNOPSIS
380
381 use Mojo::DOM;
382
383 # Parse
384 my $dom = Mojo::DOM->new('<div><p id="a">Test</p><p id="b">123</p></div>');
385
386 # Find
387 say $dom->at('#b')->text;
388 say $dom->find('p')->map('text')->join("\n");
389 say $dom->find('[id]')->map(attr => 'id')->join("\n");
390
391 # Iterate
392 $dom->find('p[id]')->reverse->each(sub { say $_->{id} });
393
394 # Loop
395 for my $e ($dom->find('p[id]')->each) {
396 say $e->{id}, ':', $e->text;
397 }
398
399 # Modify
400 $dom->find('div p')->last->append('<p id="c">456</p>');
401 $dom->at('#c')->prepend($dom->new_tag('p', id => 'd', '789'));
402 $dom->find(':not(p)')->map('strip');
403
404 # Render
405 say "$dom";
406
407=head1 DESCRIPTION
408
409L<Mojo::DOM> is a minimalistic and relaxed HTML/XML DOM parser with CSS
410selector support. It will even try to interpret broken HTML and XML, so you
411should not use it for validation.
412
413=head1 NODES AND ELEMENTS
414
415When we parse an HTML/XML fragment, it gets turned into a tree of nodes.
416
417 <!DOCTYPE html>
418 <html>
419 <head><title>Hello</title></head>
420 <body>World!</body>
421 </html>
422
423There are currently eight different kinds of nodes, C<cdata>, C<comment>,
424C<doctype>, C<pi>, C<raw>, C<root>, C<tag> and C<text>. Elements are nodes of
425the type C<tag>.
426
427 root
428 |- doctype (html)
429 +- tag (html)
430 |- tag (head)
431 | +- tag (title)
432 | +- raw (Hello)
433 +- tag (body)
434 +- text (World!)
435
436While all node types are represented as L<Mojo::DOM> objects, some methods like
437L</"attr"> and L</"namespace"> only apply to elements.
438
439=head1 CASE-SENSITIVITY
440
441L<Mojo::DOM> defaults to HTML semantics, that means all tags and attribute
442names are lowercased and selectors need to be lowercase as well.
443
444 # HTML semantics
445 my $dom = Mojo::DOM->new('<P ID="greeting">Hi!</P>');
446 say $dom->at('p[id]')->text;
447
448If an XML declaration is found, the parser will automatically switch into XML
449mode and everything becomes case-sensitive.
450
451 # XML semantics
452 my $dom = Mojo::DOM->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>');
453 say $dom->at('P[ID]')->text;
454
455HTML or XML semantics can also be forced with the L</"xml"> method.
456
457 # Force HTML semantics
458 my $dom = Mojo::DOM->new->xml(0)->parse('<P ID="greeting">Hi!</P>');
459 say $dom->at('p[id]')->text;
460
461 # Force XML semantics
462 my $dom = Mojo::DOM->new->xml(1)->parse('<P ID="greeting">Hi!</P>');
463 say $dom->at('P[ID]')->text;
464
465=head1 METHODS
466
467L<Mojo::DOM> implements the following methods.
468
469=head2 all_text
470
471 my $text = $dom->all_text;
472
473Extract text content from all descendant nodes of this element.
474
475 # "foo\nbarbaz\n"
476 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->all_text;
477
478=head2 ancestors
479
480 my $collection = $dom->ancestors;
481 my $collection = $dom->ancestors('div ~ p');
482
483Find all ancestor elements of this node matching the CSS selector and return a
484L<Mojo::Collection> object containing these elements as L<Mojo::DOM> objects.
485All selectors from L<Mojo::DOM::CSS/"SELECTORS"> are supported.
486
487 # List tag names of ancestor elements
488 say $dom->ancestors->map('tag')->join("\n");
489
490=head2 append
491
492 $dom = $dom->append('<p>I ♥ Mojolicious!</p>');
493 $dom = $dom->append(Mojo::DOM->new);
494
495Append HTML/XML fragment to this node (for all node types other than C<root>).
496
497 # "<div><h1>Test</h1><h2>123</h2></div>"
498 $dom->parse('<div><h1>Test</h1></div>')
499 ->at('h1')->append('<h2>123</h2>')->root;
500
501 # "<p>Test 123</p>"
502 $dom->parse('<p>Test</p>')->at('p')
503 ->child_nodes->first->append(' 123')->root;
504
505=head2 append_content
506
507 $dom = $dom->append_content('<p>I ♥ Mojolicious!</p>');
508 $dom = $dom->append_content(Mojo::DOM->new);
509
510Append HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
511node's content.
512
513 # "<div><h1>Test123</h1></div>"
514 $dom->parse('<div><h1>Test</h1></div>')
515 ->at('h1')->append_content('123')->root;
516
517 # "<!-- Test 123 --><br>"
518 $dom->parse('<!-- Test --><br>')
519 ->child_nodes->first->append_content('123 ')->root;
520
521 # "<p>Test<i>123</i></p>"
522 $dom->parse('<p>Test</p>')->at('p')->append_content('<i>123</i>')->root;
523
524=head2 at
525
526 my $result = $dom->at('div ~ p');
527 my $result = $dom->at('svg|line', svg => 'http://www.w3.org/2000/svg');
528
529Find first descendant element of this element matching the CSS selector and
530return it as a L<Mojo::DOM> object, or C<undef> if none could be found. All
531selectors from L<Mojo::DOM::CSS/"SELECTORS"> are supported.
532
533 # Find first element with "svg" namespace definition
534 my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'};
535
536Trailing key/value pairs can be used to declare xml namespace aliases.
537
538 # "<rect />"
539 $dom->parse('<svg xmlns="http://www.w3.org/2000/svg"><rect /></svg>')
540 ->at('svg|rect', svg => 'http://www.w3.org/2000/svg');
541
542=head2 attr
543
544 my $hash = $dom->attr;
545 my $foo = $dom->attr('foo');
546 $dom = $dom->attr({foo => 'bar'});
547 $dom = $dom->attr(foo => 'bar');
548
549This element's attributes.
550
551 # Remove an attribute
552 delete $dom->attr->{id};
553
554 # Attribute without value
555 $dom->attr(selected => undef);
556
557 # List id attributes
558 say $dom->find('*')->map(attr => 'id')->compact->join("\n");
559
560=head2 child_nodes
561
562 my $collection = $dom->child_nodes;
563
564Return a L<Mojo::Collection> object containing all child nodes of this element
565as L<Mojo::DOM> objects.
566
567 # "<p><b>123</b></p>"
568 $dom->parse('<p>Test<b>123</b></p>')->at('p')->child_nodes->first->remove;
569
570 # "<!DOCTYPE html>"
571 $dom->parse('<!DOCTYPE html><b>123</b>')->child_nodes->first;
572
573 # " Test "
574 $dom->parse('<b>123</b><!-- Test -->')->child_nodes->last->content;
575
576=head2 children
577
578 my $collection = $dom->children;
579 my $collection = $dom->children('div ~ p');
580
581Find all child elements of this element matching the CSS selector and return a
582L<Mojo::Collection> object containing these elements as L<Mojo::DOM> objects.
583All selectors from L<Mojo::DOM::CSS/"SELECTORS"> are supported.
584
585 # Show tag name of random child element
586 say $dom->children->shuffle->first->tag;
587
588=head2 content
589
590 my $str = $dom->content;
591 $dom = $dom->content('<p>I ♥ Mojolicious!</p>');
592 $dom = $dom->content(Mojo::DOM->new);
593
594Return this node's content or replace it with HTML/XML fragment (for C<root>
595and C<tag> nodes) or raw content.
596
597 # "<b>Test</b>"
598 $dom->parse('<div><b>Test</b></div>')->at('div')->content;
599
600 # "<div><h1>123</h1></div>"
601 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('123')->root;
602
603 # "<p><i>123</i></p>"
604 $dom->parse('<p>Test</p>')->at('p')->content('<i>123</i>')->root;
605
606 # "<div><h1></h1></div>"
607 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->content('')->root;
608
609 # " Test "
610 $dom->parse('<!-- Test --><br>')->child_nodes->first->content;
611
612 # "<div><!-- 123 -->456</div>"
613 $dom->parse('<div><!-- Test -->456</div>')
614 ->at('div')->child_nodes->first->content(' 123 ')->root;
615
616=head2 descendant_nodes
617
618 my $collection = $dom->descendant_nodes;
619
620Return a L<Mojo::Collection> object containing all descendant nodes of this
621element as L<Mojo::DOM> objects.
622
623 # "<p><b>123</b></p>"
624 $dom->parse('<p><!-- Test --><b>123<!-- 456 --></b></p>')
625 ->descendant_nodes->grep(sub { $_->type eq 'comment' })
626 ->map('remove')->first;
627
628 # "<p><b>test</b>test</p>"
629 $dom->parse('<p><b>123</b>456</p>')
630 ->at('p')->descendant_nodes->grep(sub { $_->type eq 'text' })
631 ->map(content => 'test')->first->root;
632
633=head2 find
634
635 my $collection = $dom->find('div ~ p');
636 my $collection = $dom->find('svg|line', svg => 'http://www.w3.org/2000/svg');
637
638Find all descendant elements of this element matching the CSS selector and
639return a L<Mojo::Collection> object containing these elements as L<Mojo::DOM>
640objects. All selectors from L<Mojo::DOM::CSS/"SELECTORS"> are supported.
641
642 # Find a specific element and extract information
643 my $id = $dom->find('div')->[23]{id};
644
645 # Extract information from multiple elements
646 my @headers = $dom->find('h1, h2, h3')->map('text')->each;
647
648 # Count all the different tags
649 my $hash = $dom->find('*')->reduce(sub { $a->{$b->tag}++; $a }, {});
650
651 # Find elements with a class that contains dots
652 my @divs = $dom->find('div.foo\.bar')->each;
653
654Trailing key/value pairs can be used to declare xml namespace aliases.
655
656 # "<rect />"
657 $dom->parse('<svg xmlns="http://www.w3.org/2000/svg"><rect /></svg>')
658 ->find('svg|rect', svg => 'http://www.w3.org/2000/svg')->first;
659
660=head2 following
661
662 my $collection = $dom->following;
663 my $collection = $dom->following('div ~ p');
664
665Find all sibling elements after this node matching the CSS selector and return
666a L<Mojo::Collection> object containing these elements as L<Mojo::DOM> objects.
667All selectors from L<Mojo::DOM::CSS/"SELECTORS"> are supported.
668
669 # List tags of sibling elements after this node
670 say $dom->following->map('tag')->join("\n");
671
672=head2 following_nodes
673
674 my $collection = $dom->following_nodes;
675
676Return a L<Mojo::Collection> object containing all sibling nodes after this
677node as L<Mojo::DOM> objects.
678
679 # "C"
680 $dom->parse('<p>A</p><!-- B -->C')->at('p')->following_nodes->last->content;
681
682=head2 matches
683
684 my $bool = $dom->matches('div ~ p');
685 my $bool = $dom->matches('svg|line', svg => 'http://www.w3.org/2000/svg');
686
687Check if this element matches the CSS selector. All selectors from
688L<Mojo::DOM::CSS/"SELECTORS"> are supported.
689
690 # True
691 $dom->parse('<p class="a">A</p>')->at('p')->matches('.a');
692 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[class]');
693
694 # False
695 $dom->parse('<p class="a">A</p>')->at('p')->matches('.b');
696 $dom->parse('<p class="a">A</p>')->at('p')->matches('p[id]');
697
698Trailing key/value pairs can be used to declare xml namespace aliases.
699
700 # True
701 $dom->parse('<svg xmlns="http://www.w3.org/2000/svg"><rect /></svg>')
702 ->matches('svg|rect', svg => 'http://www.w3.org/2000/svg');
703
704=head2 namespace
705
706 my $namespace = $dom->namespace;
707
708Find this element's namespace, or return C<undef> if none could be found.
709
710 # Find namespace for an element with namespace prefix
711 my $namespace = $dom->at('svg > svg\:circle')->namespace;
712
713 # Find namespace for an element that may or may not have a namespace prefix
714 my $namespace = $dom->at('svg > circle')->namespace;
715
716=head2 new
717
718 my $dom = Mojo::DOM->new;
719 my $dom = Mojo::DOM->new('<foo bar="baz">I ♥ Mojolicious!</foo>');
720
721Construct a new scalar-based L<Mojo::DOM> object and L</"parse"> HTML/XML
722fragment if necessary.
723
724=head2 new_tag
725
726 my $tag = Mojo::DOM->new_tag('div');
727 my $tag = $dom->new_tag('div');
728 my $tag = $dom->new_tag('div', id => 'foo', hidden => undef);
729 my $tag = $dom->new_tag('div', 'safe content');
730 my $tag = $dom->new_tag('div', id => 'foo', 'safe content');
731 my $tag = $dom->new_tag('div', data => {mojo => 'rocks'}, 'safe content');
732 my $tag = $dom->new_tag('div', id => 'foo', sub { 'unsafe content' });
733
734Construct a new L<Mojo::DOM> object for an HTML/XML tag with or without
735attributes and content. The C<data> attribute may contain a hash reference with
736key/value pairs to generate attributes from.
737
738 # "<br>"
739 $dom->new_tag('br');
740
741 # "<div></div>"
742 $dom->new_tag('div');
743
744 # "<div id="foo" hidden></div>"
745 $dom->new_tag('div', id => 'foo', hidden => undef);
746
747 # "<div>test &amp; 123</div>"
748 $dom->new_tag('div', 'test & 123');
749
750 # "<div id="foo">test &amp; 123</div>"
751 $dom->new_tag('div', id => 'foo', 'test & 123');
752
753 # "<div data-foo="1" data-bar="test">test &amp; 123</div>"
754 $dom->new_tag('div', data => {foo => 1, Bar => 'test'}, 'test & 123');
755
756 # "<div id="foo">test & 123</div>"
757 $dom->new_tag('div', id => 'foo', sub { 'test & 123' });
758
759 # "<div>Hello<b>Mojo!</b></div>"
760 $dom->parse('<div>Hello</div>')->at('div')
761 ->append_content($dom->new_tag('b', 'Mojo!'))->root;
762
763=head2 next
764
765 my $sibling = $dom->next;
766
767Return L<Mojo::DOM> object for next sibling element, or C<undef> if there are no
768more siblings.
769
770 # "<h2>123</h2>"
771 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h1')->next;
772
773=head2 next_node
774
775 my $sibling = $dom->next_node;
776
777Return L<Mojo::DOM> object for next sibling node, or C<undef> if there are no
778more siblings.
779
780 # "456"
781 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
782 ->at('b')->next_node->next_node;
783
784 # " Test "
785 $dom->parse('<p><b>123</b><!-- Test -->456</p>')
786 ->at('b')->next_node->content;
787
788=head2 parent
789
790 my $parent = $dom->parent;
791
792Return L<Mojo::DOM> object for parent of this node, or C<undef> if this node has
793no parent.
794
795 # "<b><i>Test</i></b>"
796 $dom->parse('<p><b><i>Test</i></b></p>')->at('i')->parent;
797
798=head2 parse
799
800 $dom = $dom->parse('<foo bar="baz">I ♥ Mojolicious!</foo>');
801
802Parse HTML/XML fragment with L<Mojo::DOM::HTML>.
803
804 # Parse XML
805 my $dom = Mojo::DOM->new->xml(1)->parse('<foo>I ♥ Mojolicious!</foo>');
806
807=head2 preceding
808
809 my $collection = $dom->preceding;
810 my $collection = $dom->preceding('div ~ p');
811
812Find all sibling elements before this node matching the CSS selector and return
813a L<Mojo::Collection> object containing these elements as L<Mojo::DOM> objects.
814All selectors from L<Mojo::DOM::CSS/"SELECTORS"> are supported.
815
816 # List tags of sibling elements before this node
817 say $dom->preceding->map('tag')->join("\n");
818
819=head2 preceding_nodes
820
821 my $collection = $dom->preceding_nodes;
822
823Return a L<Mojo::Collection> object containing all sibling nodes before this
824node as L<Mojo::DOM> objects.
825
826 # "A"
827 $dom->parse('A<!-- B --><p>C</p>')->at('p')->preceding_nodes->first->content;
828
829=head2 prepend
830
831 $dom = $dom->prepend('<p>I ♥ Mojolicious!</p>');
832 $dom = $dom->prepend(Mojo::DOM->new);
833
834Prepend HTML/XML fragment to this node (for all node types other than C<root>).
835
836 # "<div><h1>Test</h1><h2>123</h2></div>"
837 $dom->parse('<div><h2>123</h2></div>')
838 ->at('h2')->prepend('<h1>Test</h1>')->root;
839
840 # "<p>Test 123</p>"
841 $dom->parse('<p>123</p>')
842 ->at('p')->child_nodes->first->prepend('Test ')->root;
843
844=head2 prepend_content
845
846 $dom = $dom->prepend_content('<p>I ♥ Mojolicious!</p>');
847 $dom = $dom->prepend_content(Mojo::DOM->new);
848
849Prepend HTML/XML fragment (for C<root> and C<tag> nodes) or raw content to this
850node's content.
851
852 # "<div><h2>Test123</h2></div>"
853 $dom->parse('<div><h2>123</h2></div>')
854 ->at('h2')->prepend_content('Test')->root;
855
856 # "<!-- Test 123 --><br>"
857 $dom->parse('<!-- 123 --><br>')
858 ->child_nodes->first->prepend_content(' Test')->root;
859
860 # "<p><i>123</i>Test</p>"
861 $dom->parse('<p>Test</p>')->at('p')->prepend_content('<i>123</i>')->root;
862
863=head2 previous
864
865 my $sibling = $dom->previous;
866
867Return L<Mojo::DOM> object for previous sibling element, or C<undef> if there
868are no more siblings.
869
870 # "<h1>Test</h1>"
871 $dom->parse('<div><h1>Test</h1><h2>123</h2></div>')->at('h2')->previous;
872
873=head2 previous_node
874
875 my $sibling = $dom->previous_node;
876
877Return L<Mojo::DOM> object for previous sibling node, or C<undef> if there are
878no more siblings.
879
880 # "123"
881 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
882 ->at('b')->previous_node->previous_node;
883
884 # " Test "
885 $dom->parse('<p>123<!-- Test --><b>456</b></p>')
886 ->at('b')->previous_node->content;
887
888=head2 remove
889
890 my $parent = $dom->remove;
891
892Remove this node and return L</"root"> (for C<root> nodes) or L</"parent">.
893
894 # "<div></div>"
895 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->remove;
896
897 # "<p><b>456</b></p>"
898 $dom->parse('<p>123<b>456</b></p>')
899 ->at('p')->child_nodes->first->remove->root;
900
901=head2 replace
902
903 my $parent = $dom->replace('<div>I ♥ Mojolicious!</div>');
904 my $parent = $dom->replace(Mojo::DOM->new);
905
906Replace this node with HTML/XML fragment and return L</"root"> (for C<root>
907nodes) or L</"parent">.
908
909 # "<div><h2>123</h2></div>"
910 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->replace('<h2>123</h2>');
911
912 # "<p><b>123</b></p>"
913 $dom->parse('<p>Test</p>')
914 ->at('p')->child_nodes->[0]->replace('<b>123</b>')->root;
915
916=head2 root
917
918 my $root = $dom->root;
919
920Return L<Mojo::DOM> object for C<root> node.
921
922=head2 selector
923
924 my $selector = $dom->selector;
925
926Get a unique CSS selector for this element.
927
928 # "ul:nth-child(1) > li:nth-child(2)"
929 $dom->parse('<ul><li>Test</li><li>123</li></ul>')->find('li')->last->selector;
930
931 # "p:nth-child(1) > b:nth-child(1) > i:nth-child(1)"
932 $dom->parse('<p><b><i>Test</i></b></p>')->at('i')->selector;
933
934=head2 strip
935
936 my $parent = $dom->strip;
937
938Remove this element while preserving its content and return L</"parent">.
939
940 # "<div>Test</div>"
941 $dom->parse('<div><h1>Test</h1></div>')->at('h1')->strip;
942
943=head2 tag
944
945 my $tag = $dom->tag;
946 $dom = $dom->tag('div');
947
948This element's tag name.
949
950 # List tag names of child elements
951 say $dom->children->map('tag')->join("\n");
952
953=head2 tap
954
955 $dom = $dom->tap(sub {...});
956
957Alias for L<Mojo::Base/"tap">.
958
959=head2 text
960
961 my $text = $dom->text;
962
963Extract text content from this element only (not including child elements).
964
965 # "bar"
966 $dom->parse("<div>foo<p>bar</p>baz</div>")->at('p')->text;
967
968 # "foo\nbaz\n"
969 $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->at('div')->text;
970
971=head2 to_string
972
973 my $str = $dom->to_string;
974
975Render this node and its content to HTML/XML.
976
977 # "<b>Test</b>"
978 $dom->parse('<div><b>Test</b></div>')->at('div b')->to_string;
979
980=head2 tree
981
982 my $tree = $dom->tree;
983 $dom = $dom->tree(['root']);
984
985Document Object Model. Note that this structure should only be used very
986carefully since it is very dynamic.
987
988=head2 type
989
990 my $type = $dom->type;
991
992This node's type, usually C<cdata>, C<comment>, C<doctype>, C<pi>, C<raw>,
993C<root>, C<tag> or C<text>.
994
995 # "cdata"
996 $dom->parse('<![CDATA[Test]]>')->child_nodes->first->type;
997
998 # "comment"
999 $dom->parse('<!-- Test -->')->child_nodes->first->type;
1000
1001 # "doctype"
1002 $dom->parse('<!DOCTYPE html>')->child_nodes->first->type;
1003
1004 # "pi"
1005 $dom->parse('<?xml version="1.0"?>')->child_nodes->first->type;
1006
1007 # "raw"
1008 $dom->parse('<title>Test</title>')->at('title')->child_nodes->first->type;
1009
1010 # "root"
1011 $dom->parse('<p>Test</p>')->type;
1012
1013 # "tag"
1014 $dom->parse('<p>Test</p>')->at('p')->type;
1015
1016 # "text"
1017 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->type;
1018
1019=head2 val
1020
1021 my $value = $dom->val;
1022
1023Extract value from form element (such as C<button>, C<input>, C<option>,
1024C<select> and C<textarea>), or return C<undef> if this element has no value. In
1025the case of C<select> with C<multiple> attribute, find C<option> elements with
1026C<selected> attribute and return an array reference with all values, or C<undef>
1027if none could be found.
1028
1029 # "a"
1030 $dom->parse('<input name=test value=a>')->at('input')->val;
1031
1032 # "b"
1033 $dom->parse('<textarea>b</textarea>')->at('textarea')->val;
1034
1035 # "c"
1036 $dom->parse('<option value="c">Test</option>')->at('option')->val;
1037
1038 # "d"
1039 $dom->parse('<select><option selected>d</option></select>')
1040 ->at('select')->val;
1041
1042 # "e"
1043 $dom->parse('<select multiple><option selected>e</option></select>')
1044 ->at('select')->val->[0];
1045
1046 # "on"
1047 $dom->parse('<input name=test type=checkbox>')->at('input')->val;
1048
1049=head2 with_roles
1050
1051 my $new_class = Mojo::DOM->with_roles('Mojo::DOM::Role::One');
1052 my $new_class = Mojo::DOM->with_roles('+One', '+Two');
1053 $dom = $dom->with_roles('+One', '+Two');
1054
1055Alias for L<Mojo::Base/"with_roles">.
1056
1057=head2 wrap
1058
1059 $dom = $dom->wrap('<div></div>');
1060 $dom = $dom->wrap(Mojo::DOM->new);
1061
1062Wrap HTML/XML fragment around this node (for all node types other than C<root>),
1063placing it as the last child of the first innermost element.
1064
1065 # "<p>123<b>Test</b></p>"
1066 $dom->parse('<b>Test</b>')->at('b')->wrap('<p>123</p>')->root;
1067
1068 # "<div><p><b>Test</b></p>123</div>"
1069 $dom->parse('<b>Test</b>')->at('b')->wrap('<div><p></p>123</div>')->root;
1070
1071 # "<p><b>Test</b></p><p>123</p>"
1072 $dom->parse('<b>Test</b>')->at('b')->wrap('<p></p><p>123</p>')->root;
1073
1074 # "<p><b>Test</b></p>"
1075 $dom->parse('<p>Test</p>')->at('p')->child_nodes->first->wrap('<b>')->root;
1076
1077=head2 wrap_content
1078
1079 $dom = $dom->wrap_content('<div></div>');
1080 $dom = $dom->wrap_content(Mojo::DOM->new);
1081
1082Wrap HTML/XML fragment around this node's content (for C<root> and C<tag>
1083nodes), placing that content as the last children of the first innermost element.
1084
1085 # "<p><b>123Test</b></p>"
1086 $dom->parse('<p>Test</p>')->at('p')->wrap_content('<b>123</b>')->root;
1087
1088 # "<p><b>Test</b></p><p>123</p>"
1089 $dom->parse('<b>Test</b>')->wrap_content('<p></p><p>123</p>');
1090
1091=head2 xml
1092
1093 my $bool = $dom->xml;
1094 $dom = $dom->xml($bool);
1095
1096Disable HTML semantics in parser and activate case-sensitivity, defaults to
1097auto-detection based on XML declarations.
1098
1099=head1 OPERATORS
1100
1101L<Mojo::DOM> overloads the following operators.
1102
1103=head2 array
1104
1105 my @nodes = @$dom;
1106
1107Alias for L</"child_nodes">.
1108
1109 # "<!-- Test -->"
1110 $dom->parse('<!-- Test --><b>123</b>')->[0];
1111
1112=head2 bool
1113
1114 my $bool = !!$dom;
1115
1116Always true.
1117
1118=head2 hash
1119
1120 my %attrs = %$dom;
1121
1122Alias for L</"attr">.
1123
1124 # "test"
1125 $dom->parse('<div id="test">Test</div>')->at('div')->{id};
1126
1127=head2 stringify
1128
1129 my $str = "$dom";
1130
1131Alias for L</"to_string">.
1132
1133=head1 SEE ALSO
1134
1135L<Mojolicious>, L<Mojolicious::Guides>, L<https://mojolicious.org>.
1136
1137=cut
Note: See TracBrowser for help on using the repository browser.