package HTML::Form; use strict; use URI; use Carp (); use vars qw($VERSION $Encode_available); $VERSION = "5.829"; eval { require Encode }; $Encode_available = !$@; my %form_tags = map {$_ => 1} qw(input textarea button select option); my %type2class = ( text => "TextInput", password => "TextInput", hidden => "TextInput", textarea => "TextInput", "reset" => "IgnoreInput", radio => "ListInput", checkbox => "ListInput", option => "ListInput", button => "SubmitInput", submit => "SubmitInput", image => "ImageInput", file => "FileInput", keygen => "KeygenInput", ); =head1 NAME HTML::Form - Class that represents an HTML form element =head1 SYNOPSIS use HTML::Form; $form = HTML::Form->parse($html, $base_uri); $form->value(query => "Perl"); use LWP::UserAgent; $ua = LWP::UserAgent->new; $response = $ua->request($form->click); =head1 DESCRIPTION Objects of the C class represents a single HTML CformE ... E/formE> instance. A form consists of a sequence of inputs that usually have names, and which can take on various values. The state of a form can be tweaked and it can then be asked to provide C objects that can be passed to the request() method of C. The following methods are available: =over 4 =item @forms = HTML::Form->parse( $html_document, $base_uri ) =item @forms = HTML::Form->parse( $html_document, base => $base_uri, %opt ) =item @forms = HTML::Form->parse( $response, %opt ) The parse() class method will parse an HTML document and build up C objects for each
element found. If called in scalar context only returns the first . Returns an empty list if there are no forms to be found. The required arguments is the HTML document to parse ($html_document) and the URI used to retrieve the document ($base_uri). The base URI is needed to resolve relative action URIs. The provided HTML document should be a Unicode string (or US-ASCII). By default HTML::Form assumes that the original document was UTF-8 encoded and thus encode forms that don't specify an explict I as UTF-8. The charset assumed can be overridden by providing the C option to parse(). It's a good idea to be explict about this parameter as well, thus the recommended simplest invocation becomes: my @forms = HTML::Form->parse( Encode::decode($encoding, $html_document_bytes), base => $base_uri, charset => $encoding, ); If the document was retrieved with LWP then the response object provide methods to obtain a proper value for C and C: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response->decoded_content, base => $response->base, charset => $response->content_charset, ); In fact, the parse() method can parse from an C object directly, so the example above can be more conveniently written as: my $ua = LWP::UserAgent->new; my $response = $ua->get("http://www.example.com/form.html"); my @forms = HTML::Form->parse($response); Note that any object that implements a decoded_content(), base() and content_charset() method with similar behaviour as C will do. Additional options might be passed in to control how the parse method behaves. The following are all the options currently recognized: =over =item C<< base => $uri >> This is the URI used to retrive the original document. This option is not optional ;-) =item C<< charset => $str >> Specify what charset the original document was encoded in. This is used as the default for accept_charset. If not provided this defaults to "UTF-8". =item C<< verbose => $bool >> Warn (print messages to STDERR) about any bad HTML form constructs found. You can trap these with $SIG{__WARN__}. =item C<< strict => $bool >> Initialize any form objects with the given strict attribute. =back =cut sub parse { my $class = shift; my $html = shift; unshift(@_, "base") if @_ == 1; my %opt = @_; require HTML::TokeParser; my $p = HTML::TokeParser->new(ref($html) ? $html->decoded_content(ref => 1) : \$html); die "Failed to create HTML::TokeParser object" unless $p; my $base_uri = delete $opt{base}; my $charset = delete $opt{charset}; my $strict = delete $opt{strict}; my $verbose = delete $opt{verbose}; if ($^W) { Carp::carp("Unrecognized option $_ in HTML::Form->parse") for sort keys %opt; } unless (defined $base_uri) { if (ref($html)) { $base_uri = $html->base; } else { Carp::croak("HTML::Form::parse: No \$base_uri provided"); } } unless (defined $charset) { if (ref($html) and $html->can("content_charset")) { $charset = $html->content_charset; } unless ($charset) { $charset = "UTF-8"; } } my @forms; my $f; # current form my %openselect; # index to the open instance of a select while (my $t = $p->get_tag) { my($tag,$attr) = @$t; if ($tag eq "form") { my $action = delete $attr->{'action'}; $action = "" unless defined $action; $action = URI->new_abs($action, $base_uri); $f = $class->new($attr->{'method'}, $action, $attr->{'enctype'}); $f->accept_charset($attr->{'accept-charset'}) if $attr->{'accept-charset'}; $f->{default_charset} = $charset; $f->{attr} = $attr; $f->strict(1) if $strict; %openselect = (); push(@forms, $f); my(%labels, $current_label); while (my $t = $p->get_tag) { my($tag, $attr) = @$t; last if $tag eq "/form"; # if we are inside a label tag, then keep # appending any text to the current label if(defined $current_label) { $current_label = join " ", grep { defined and length } $current_label, $p->get_phrase; } if ($tag eq "input") { $attr->{value_name} = exists $attr->{id} && exists $labels{$attr->{id}} ? $labels{$attr->{id}} : defined $current_label ? $current_label : $p->get_phrase; } if ($tag eq "label") { $current_label = $p->get_phrase; $labels{ $attr->{for} } = $current_label if exists $attr->{for}; } elsif ($tag eq "/label") { $current_label = undef; } elsif ($tag eq "input") { my $type = delete $attr->{type} || "text"; $f->push_input($type, $attr, $verbose); } elsif ($tag eq "button") { my $type = delete $attr->{type} || "submit"; $f->push_input($type, $attr, $verbose); } elsif ($tag eq "textarea") { $attr->{textarea_value} = $attr->{value} if exists $attr->{value}; my $text = $p->get_text("/textarea"); $attr->{value} = $text; $f->push_input("textarea", $attr, $verbose); } elsif ($tag eq "select") { # rename attributes reserved to come for the option tag for ("value", "value_name") { $attr->{"select_$_"} = delete $attr->{$_} if exists $attr->{$_}; } # count this new select option separately my $name = $attr->{name}; $name = "" unless defined $name; $openselect{$name}++; while ($t = $p->get_tag) { my $tag = shift @$t; last if $tag eq "/select"; next if $tag =~ m,/?optgroup,; next if $tag eq "/option"; if ($tag eq "option") { my %a = %{$t->[0]}; # rename keys so they don't clash with %attr for (keys %a) { next if $_ eq "value"; $a{"option_$_"} = delete $a{$_}; } while (my($k,$v) = each %$attr) { $a{$k} = $v; } $a{value_name} = $p->get_trimmed_text; $a{value} = delete $a{value_name} unless defined $a{value}; $a{idx} = $openselect{$name}; $f->push_input("option", \%a, $verbose); } else { warn("Bad here, so we # try to do the same. Actually the MSIE behaviour # appears really strange: and