source: trunk/gsdl/perllib/cpan/HTML/TokeParser/Simple/Token/Tag/Start.pm@ 13983

Last change on this file since 13983 was 13983, checked in by lh92, 17 years ago

Added for Realistic Book Project

  • Property svn:keywords set to Author Date Id Revision
File size: 4.6 KB
Line 
1package HTML::TokeParser::Simple::Token::Tag::Start;
2
3use strict;
4
5use vars qw/ $VERSION $REVISION /;
6$REVISION = '$Id: Start.pm 13983 2007-03-15 01:32:44Z lh92 $';
7$VERSION = '1.0';
8use base 'HTML::TokeParser::Simple::Token::Tag';
9
10use HTML::Entities qw/encode_entities/;
11
12my %TOKEN = (
13 tag => 1,
14 attr => 2,
15 attrseq => 3,
16 text => 4
17);
18
19my %INSTANCE;
20
21sub _init {
22 my $self = shift;
23 if ('S' eq $self->[0]) {
24 $INSTANCE{$self}{offset} = 0;
25 $INSTANCE{$self}{tag} = $self->[1];
26 }
27 else {
28 $INSTANCE{$self}{offset} = -1;
29 my $tag = $self->[0];
30 $tag =~ s/^\///;
31 $INSTANCE{$self}{tag} = $tag;
32 }
33 return $self;
34}
35
36sub _get_offset { return $INSTANCE{+shift}{offset} }
37sub _get_text { return shift->[-1] }
38
39sub _get_tag {
40 my $self = shift;
41 return $INSTANCE{$self}{tag};
42}
43
44sub _get_attrseq {
45 my $self = shift;
46 my $index = $TOKEN{attrseq} + $self->_get_offset;
47 return $self->[$index];
48}
49
50sub _get_attr {
51 my $self = shift;
52 my $index = $TOKEN{attr} + $self->_get_offset;
53 return $self->[$index];
54}
55
56sub DESTROY { delete $INSTANCE{+shift} }
57
58sub return_attr { goto &get_attr }
59sub return_attrseq { goto &get_attrseq }
60sub return_tag { goto &get_tag }
61
62# attribute munging methods
63
64sub set_attr {
65 my ($self, $name, $value) = @_;
66 return 'HASH' eq ref $name
67 ? $self->_set_attr_from_hashref($name)
68 : $self->_set_attr_from_string($name, $value);
69}
70
71sub _set_attr_from_string {
72 my ($self, $name, $value) = @_;
73 $name = lc $name;
74 my $attr = $self->get_attr;
75 my $attrseq = $self->get_attrseq;
76 unless (exists $attr->{$name}) {
77 push @$attrseq => $name;
78 }
79 $attr->{$name} = $value;
80 $self->rewrite_tag;
81}
82
83sub _set_attr_from_hashref {
84 my ($self, $attr_hash) = @_;
85 while (my ($attr, $value) = each %$attr_hash) {
86 $self->set_attr($attr, $value);
87 }
88 return $self;
89}
90
91sub rewrite_tag {
92 my $self = shift;
93 my $attr = $self->get_attr;
94 my $attrseq = $self->get_attrseq;
95
96 # capture the final slash if the tag is self-closing
97 my ($self_closing) = $self->_get_text =~ m{(\s?/)>$};
98 $self_closing ||= '';
99
100 my $tag = '';
101 foreach ( @$attrseq ) {
102 next if $_ eq '/'; # is this a bug in HTML::TokeParser?
103 $tag .= sprintf qq{ %s="%s"} => $_, encode_entities($attr->{$_});
104 }
105 my $first = $self->is_end_tag ? '/' : '';
106 $tag = sprintf '<%s%s%s%s>', $first, $self->get_tag, $tag, $self_closing;
107 $self->_set_text($tag);
108 return $self;
109}
110
111sub delete_attr {
112 my ($self,$name) = @_;
113 $name = lc $name;
114 my $attr = $self->get_attr;
115 return unless exists $attr->{$name};
116 delete $attr->{$name};
117 my $attrseq = $self->get_attrseq;
118 @$attrseq = grep { $_ ne $name } @$attrseq;
119 $self->rewrite_tag;
120}
121
122# get_foo methods
123
124sub return_text {
125 require Carp;
126 Carp::carp('return_text() is deprecated. Use as_is() instead');
127 goto &as_is;
128}
129
130sub as_is {
131 return shift->_get_text;
132}
133
134sub get_tag {
135 return shift->_get_tag;
136}
137
138sub get_token0 {
139 return '';
140}
141
142sub get_attr {
143 my $self = shift;
144 my $attributes = $self->_get_attr;
145 return @_ ? $attributes->{lc shift} : $attributes;
146}
147
148sub get_attrseq {
149 my $self = shift;
150 $self->_get_attrseq;
151}
152
153# is_foo methods
154
155sub is_tag {
156 my $self = shift;
157 return $self->is_start_tag( @_ );
158}
159
160sub is_start_tag {
161 my ($self, $tag) = @_;
162 return $tag ? $self->_match_tag($tag) : 1;
163}
164
165sub _match_tag {
166 my ($self, $tag) = @_;
167 return 'Regexp' eq ref $tag
168 ? $self->_get_tag =~ $tag
169 : $self->_get_tag eq lc $tag;
170}
171
1721;
173
174__END__
175
176=head1 NAME
177
178HTML::TokeParser::Simple::Token::Tag::Start - Token.pm "start tag" class.
179
180=head1 SYNOPSIS
181
182 use HTML::TokeParser::Simple;
183 my $p = HTML::TokeParser::Simple->new( $somefile );
184
185 while ( my $token = $p->get_token ) {
186 # This prints all text in an HTML doc (i.e., it strips the HTML)
187 next unless $token->is_text;
188 print $token->as_is;
189 }
190
191=head1 DESCRIPTION
192
193This class does most of the heavy lifting for C<HTML::TokeParser::Simple>. See
194the C<HTML::TokeParser::Simple> docs for details.
195
196=head1 OVERRIDDEN METHODS
197
198=over 4
199
200=item * as_is
201
202=item * delete_attr
203
204=item * get_attr
205
206=item * get_attrseq
207
208=item * get_tag
209
210=item * get_token0
211
212=item * is_start_tag
213
214=item * is_tag
215
216=item * return_attr
217
218=item * return_attrseq
219
220=item * return_tag
221
222=item * return_text
223
224=item * rewrite_tag
225
226=item * set_attr
227
228=cut
229
Note: See TracBrowser for help on using the repository browser.