1 | package HTML::TokeParser::Simple::Token::Tag::Start;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 |
|
---|
5 | use vars qw/ $VERSION $REVISION /;
|
---|
6 | $REVISION = '$Id: Start.pm 13983 2007-03-15 01:32:44Z lh92 $';
|
---|
7 | $VERSION = '1.0';
|
---|
8 | use base 'HTML::TokeParser::Simple::Token::Tag';
|
---|
9 |
|
---|
10 | use HTML::Entities qw/encode_entities/;
|
---|
11 |
|
---|
12 | my %TOKEN = (
|
---|
13 | tag => 1,
|
---|
14 | attr => 2,
|
---|
15 | attrseq => 3,
|
---|
16 | text => 4
|
---|
17 | );
|
---|
18 |
|
---|
19 | my %INSTANCE;
|
---|
20 |
|
---|
21 | sub _init {
|
---|
22 | my $self = shift;
|
---|
23 | if ('S' eq $self->[0]) {
|
---|
24 | $INSTANCE{$self}{offset} = 0;
|
---|
25 | $INSTANCE{$self}{tag} = $self->[1];
|
---|
26 | }
|
---|
27 | else {
|
---|
28 | $INSTANCE{$self}{offset} = -1;
|
---|
29 | my $tag = $self->[0];
|
---|
30 | $tag =~ s/^\///;
|
---|
31 | $INSTANCE{$self}{tag} = $tag;
|
---|
32 | }
|
---|
33 | return $self;
|
---|
34 | }
|
---|
35 |
|
---|
36 | sub _get_offset { return $INSTANCE{+shift}{offset} }
|
---|
37 | sub _get_text { return shift->[-1] }
|
---|
38 |
|
---|
39 | sub _get_tag {
|
---|
40 | my $self = shift;
|
---|
41 | return $INSTANCE{$self}{tag};
|
---|
42 | }
|
---|
43 |
|
---|
44 | sub _get_attrseq {
|
---|
45 | my $self = shift;
|
---|
46 | my $index = $TOKEN{attrseq} + $self->_get_offset;
|
---|
47 | return $self->[$index];
|
---|
48 | }
|
---|
49 |
|
---|
50 | sub _get_attr {
|
---|
51 | my $self = shift;
|
---|
52 | my $index = $TOKEN{attr} + $self->_get_offset;
|
---|
53 | return $self->[$index];
|
---|
54 | }
|
---|
55 |
|
---|
56 | sub DESTROY { delete $INSTANCE{+shift} }
|
---|
57 |
|
---|
58 | sub return_attr { goto &get_attr }
|
---|
59 | sub return_attrseq { goto &get_attrseq }
|
---|
60 | sub return_tag { goto &get_tag }
|
---|
61 |
|
---|
62 | # attribute munging methods
|
---|
63 |
|
---|
64 | sub set_attr {
|
---|
65 | my ($self, $name, $value) = @_;
|
---|
66 | return 'HASH' eq ref $name
|
---|
67 | ? $self->_set_attr_from_hashref($name)
|
---|
68 | : $self->_set_attr_from_string($name, $value);
|
---|
69 | }
|
---|
70 |
|
---|
71 | sub _set_attr_from_string {
|
---|
72 | my ($self, $name, $value) = @_;
|
---|
73 | $name = lc $name;
|
---|
74 | my $attr = $self->get_attr;
|
---|
75 | my $attrseq = $self->get_attrseq;
|
---|
76 | unless (exists $attr->{$name}) {
|
---|
77 | push @$attrseq => $name;
|
---|
78 | }
|
---|
79 | $attr->{$name} = $value;
|
---|
80 | $self->rewrite_tag;
|
---|
81 | }
|
---|
82 |
|
---|
83 | sub _set_attr_from_hashref {
|
---|
84 | my ($self, $attr_hash) = @_;
|
---|
85 | while (my ($attr, $value) = each %$attr_hash) {
|
---|
86 | $self->set_attr($attr, $value);
|
---|
87 | }
|
---|
88 | return $self;
|
---|
89 | }
|
---|
90 |
|
---|
91 | sub rewrite_tag {
|
---|
92 | my $self = shift;
|
---|
93 | my $attr = $self->get_attr;
|
---|
94 | my $attrseq = $self->get_attrseq;
|
---|
95 |
|
---|
96 | # capture the final slash if the tag is self-closing
|
---|
97 | my ($self_closing) = $self->_get_text =~ m{(\s?/)>$};
|
---|
98 | $self_closing ||= '';
|
---|
99 |
|
---|
100 | my $tag = '';
|
---|
101 | foreach ( @$attrseq ) {
|
---|
102 | next if $_ eq '/'; # is this a bug in HTML::TokeParser?
|
---|
103 | $tag .= sprintf qq{ %s="%s"} => $_, encode_entities($attr->{$_});
|
---|
104 | }
|
---|
105 | my $first = $self->is_end_tag ? '/' : '';
|
---|
106 | $tag = sprintf '<%s%s%s%s>', $first, $self->get_tag, $tag, $self_closing;
|
---|
107 | $self->_set_text($tag);
|
---|
108 | return $self;
|
---|
109 | }
|
---|
110 |
|
---|
111 | sub delete_attr {
|
---|
112 | my ($self,$name) = @_;
|
---|
113 | $name = lc $name;
|
---|
114 | my $attr = $self->get_attr;
|
---|
115 | return unless exists $attr->{$name};
|
---|
116 | delete $attr->{$name};
|
---|
117 | my $attrseq = $self->get_attrseq;
|
---|
118 | @$attrseq = grep { $_ ne $name } @$attrseq;
|
---|
119 | $self->rewrite_tag;
|
---|
120 | }
|
---|
121 |
|
---|
122 | # get_foo methods
|
---|
123 |
|
---|
124 | sub return_text {
|
---|
125 | require Carp;
|
---|
126 | Carp::carp('return_text() is deprecated. Use as_is() instead');
|
---|
127 | goto &as_is;
|
---|
128 | }
|
---|
129 |
|
---|
130 | sub as_is {
|
---|
131 | return shift->_get_text;
|
---|
132 | }
|
---|
133 |
|
---|
134 | sub get_tag {
|
---|
135 | return shift->_get_tag;
|
---|
136 | }
|
---|
137 |
|
---|
138 | sub get_token0 {
|
---|
139 | return '';
|
---|
140 | }
|
---|
141 |
|
---|
142 | sub get_attr {
|
---|
143 | my $self = shift;
|
---|
144 | my $attributes = $self->_get_attr;
|
---|
145 | return @_ ? $attributes->{lc shift} : $attributes;
|
---|
146 | }
|
---|
147 |
|
---|
148 | sub get_attrseq {
|
---|
149 | my $self = shift;
|
---|
150 | $self->_get_attrseq;
|
---|
151 | }
|
---|
152 |
|
---|
153 | # is_foo methods
|
---|
154 |
|
---|
155 | sub is_tag {
|
---|
156 | my $self = shift;
|
---|
157 | return $self->is_start_tag( @_ );
|
---|
158 | }
|
---|
159 |
|
---|
160 | sub is_start_tag {
|
---|
161 | my ($self, $tag) = @_;
|
---|
162 | return $tag ? $self->_match_tag($tag) : 1;
|
---|
163 | }
|
---|
164 |
|
---|
165 | sub _match_tag {
|
---|
166 | my ($self, $tag) = @_;
|
---|
167 | return 'Regexp' eq ref $tag
|
---|
168 | ? $self->_get_tag =~ $tag
|
---|
169 | : $self->_get_tag eq lc $tag;
|
---|
170 | }
|
---|
171 |
|
---|
172 | 1;
|
---|
173 |
|
---|
174 | __END__
|
---|
175 |
|
---|
176 | =head1 NAME
|
---|
177 |
|
---|
178 | HTML::TokeParser::Simple::Token::Tag::Start - Token.pm "start tag" class.
|
---|
179 |
|
---|
180 | =head1 SYNOPSIS
|
---|
181 |
|
---|
182 | use HTML::TokeParser::Simple;
|
---|
183 | my $p = HTML::TokeParser::Simple->new( $somefile );
|
---|
184 |
|
---|
185 | while ( my $token = $p->get_token ) {
|
---|
186 | # This prints all text in an HTML doc (i.e., it strips the HTML)
|
---|
187 | next unless $token->is_text;
|
---|
188 | print $token->as_is;
|
---|
189 | }
|
---|
190 |
|
---|
191 | =head1 DESCRIPTION
|
---|
192 |
|
---|
193 | This class does most of the heavy lifting for C<HTML::TokeParser::Simple>. See
|
---|
194 | the C<HTML::TokeParser::Simple> docs for details.
|
---|
195 |
|
---|
196 | =head1 OVERRIDDEN METHODS
|
---|
197 |
|
---|
198 | =over 4
|
---|
199 |
|
---|
200 | =item * as_is
|
---|
201 |
|
---|
202 | =item * delete_attr
|
---|
203 |
|
---|
204 | =item * get_attr
|
---|
205 |
|
---|
206 | =item * get_attrseq
|
---|
207 |
|
---|
208 | =item * get_tag
|
---|
209 |
|
---|
210 | =item * get_token0
|
---|
211 |
|
---|
212 | =item * is_start_tag
|
---|
213 |
|
---|
214 | =item * is_tag
|
---|
215 |
|
---|
216 | =item * return_attr
|
---|
217 |
|
---|
218 | =item * return_attrseq
|
---|
219 |
|
---|
220 | =item * return_tag
|
---|
221 |
|
---|
222 | =item * return_text
|
---|
223 |
|
---|
224 | =item * rewrite_tag
|
---|
225 |
|
---|
226 | =item * set_attr
|
---|
227 |
|
---|
228 | =cut
|
---|
229 |
|
---|