1 | # $Id: XMLParser.pm 7909 2004-08-06 05:11:55Z mdewsnip $
|
---|
2 |
|
---|
3 | package XML::XPath::XMLParser;
|
---|
4 |
|
---|
5 | use strict;
|
---|
6 |
|
---|
7 | use XML::Parser;
|
---|
8 | #use XML::XPath;
|
---|
9 | use XML::XPath::Node;
|
---|
10 | use XML::XPath::Node::Element;
|
---|
11 | use XML::XPath::Node::Text;
|
---|
12 | use XML::XPath::Node::Comment;
|
---|
13 | use XML::XPath::Node::PI;
|
---|
14 | use XML::XPath::Node::Attribute;
|
---|
15 | use XML::XPath::Node::Namespace;
|
---|
16 |
|
---|
17 | my @options = qw(
|
---|
18 | filename
|
---|
19 | xml
|
---|
20 | parser
|
---|
21 | ioref
|
---|
22 | );
|
---|
23 |
|
---|
24 | my ($_current, $_namespaces_on);
|
---|
25 | my %IdNames;
|
---|
26 |
|
---|
27 | use vars qw/$xmlns_ns $xml_ns/;
|
---|
28 |
|
---|
29 | $xmlns_ns = "http://www.w3.org/2000/xmlns/";
|
---|
30 | $xml_ns = "http://www.w3.org/XML/1998/namespace";
|
---|
31 |
|
---|
32 | sub new {
|
---|
33 | my $proto = shift;
|
---|
34 | my $class = ref($proto) || $proto;
|
---|
35 | my %args = @_;
|
---|
36 | my %hash = map(( "_$_" => $args{$_} ), @options);
|
---|
37 | bless \%hash, $class;
|
---|
38 | }
|
---|
39 |
|
---|
40 | sub parse {
|
---|
41 | my $self = shift;
|
---|
42 |
|
---|
43 | $self->{IdNames} = {};
|
---|
44 | $self->{InScopeNamespaceStack} = [ {
|
---|
45 | '_Default' => undef,
|
---|
46 | 'xmlns' => $xmlns_ns,
|
---|
47 | 'xml' => $xml_ns,
|
---|
48 | } ];
|
---|
49 |
|
---|
50 | $self->{NodeStack} = [ ];
|
---|
51 |
|
---|
52 | $self->set_xml($_[0]) if $_[0];
|
---|
53 |
|
---|
54 | my $parser = $self->get_parser || XML::Parser->new(
|
---|
55 | ErrorContext => 2,
|
---|
56 | ParseParamEnt => 1,
|
---|
57 | );
|
---|
58 |
|
---|
59 | $parser->setHandlers(
|
---|
60 | Init => sub { $self->parse_init(@_) },
|
---|
61 | Char => sub { $self->parse_char(@_) },
|
---|
62 | Start => sub { $self->parse_start(@_) },
|
---|
63 | End => sub { $self->parse_end(@_) },
|
---|
64 | Final => sub { $self->parse_final(@_) },
|
---|
65 | Proc => sub { $self->parse_pi(@_) },
|
---|
66 | Comment => sub { $self->parse_comment(@_) },
|
---|
67 | Attlist => sub { $self->parse_attlist(@_) },
|
---|
68 | );
|
---|
69 |
|
---|
70 | my $toparse;
|
---|
71 | if ($toparse = $self->get_filename) {
|
---|
72 | return $parser->parsefile($toparse);
|
---|
73 | }
|
---|
74 | else {
|
---|
75 | return $parser->parse($self->get_xml || $self->get_ioref);
|
---|
76 | }
|
---|
77 | }
|
---|
78 |
|
---|
79 | sub parsefile {
|
---|
80 | my $self = shift;
|
---|
81 | my ($filename) = @_;
|
---|
82 | $self->set_filename($filename);
|
---|
83 | $self->parse;
|
---|
84 | }
|
---|
85 |
|
---|
86 | sub parse_init {
|
---|
87 | my $self = shift;
|
---|
88 | my $e = shift;
|
---|
89 | my $document = XML::XPath::Node::Element->new();
|
---|
90 | my $newns = XML::XPath::Node::Namespace->new('xml', $xml_ns);
|
---|
91 | $document->appendNamespace($newns);
|
---|
92 | $self->{current} = $self->{DOC_Node} = $document;
|
---|
93 | }
|
---|
94 |
|
---|
95 | sub parse_final {
|
---|
96 | my $self = shift;
|
---|
97 | return $self->{DOC_Node};
|
---|
98 | }
|
---|
99 |
|
---|
100 | sub parse_char {
|
---|
101 | my $self = shift;
|
---|
102 | my $e = shift;
|
---|
103 | my $text = shift;
|
---|
104 |
|
---|
105 | my $parent = $self->{current};
|
---|
106 |
|
---|
107 | my $last = $parent->getLastChild;
|
---|
108 | if ($last && $last->isTextNode) {
|
---|
109 | # append to previous text node
|
---|
110 | $last->appendText($text);
|
---|
111 | return;
|
---|
112 | }
|
---|
113 |
|
---|
114 | my $node = XML::XPath::Node::Text->new($text);
|
---|
115 | $parent->appendChild($node, 1);
|
---|
116 | }
|
---|
117 |
|
---|
118 | sub parse_start {
|
---|
119 | my $self = shift;
|
---|
120 | my $e = shift;
|
---|
121 | my $tag = shift;
|
---|
122 |
|
---|
123 | push @{ $self->{InScopeNamespaceStack} },
|
---|
124 | { %{ $self->{InScopeNamespaceStack}[-1] } };
|
---|
125 | $self->_scan_namespaces(@_);
|
---|
126 |
|
---|
127 | my ($prefix, $namespace) = $self->_namespace($tag);
|
---|
128 |
|
---|
129 | my $node = XML::XPath::Node::Element->new($tag, $prefix);
|
---|
130 |
|
---|
131 | my @attributes;
|
---|
132 | for (my $ii = 0; $ii < $#_; $ii += 2) {
|
---|
133 | my ($name, $value) = ($_[$ii], $_[$ii+1]);
|
---|
134 | if ($name =~ /^xmlns(:(.*))?$/) {
|
---|
135 | # namespace node
|
---|
136 | my $prefix = $2 || '#default';
|
---|
137 | # warn "Creating NS node: $prefix = $value\n";
|
---|
138 | my $newns = XML::XPath::Node::Namespace->new($prefix, $value);
|
---|
139 | $node->appendNamespace($newns);
|
---|
140 | }
|
---|
141 | else {
|
---|
142 | my ($prefix, $namespace) = $self->_namespace($name);
|
---|
143 | undef $namespace unless $prefix;
|
---|
144 |
|
---|
145 | my $newattr = XML::XPath::Node::Attribute->new($name, $value, $prefix);
|
---|
146 | $node->appendAttribute($newattr, 1);
|
---|
147 | if (exists($self->{IdNames}{$tag}) && ($self->{IdNames}{$tag} eq $name)) {
|
---|
148 | # warn "appending Id Element: $val for ", $node->getName, "\n";
|
---|
149 | $self->{DOC_Node}->appendIdElement($value, $node);
|
---|
150 | }
|
---|
151 | }
|
---|
152 | }
|
---|
153 |
|
---|
154 | $self->{current}->appendChild($node, 1);
|
---|
155 | $self->{current} = $node;
|
---|
156 | }
|
---|
157 |
|
---|
158 | sub parse_end {
|
---|
159 | my $self = shift;
|
---|
160 | my $e = shift;
|
---|
161 | $self->{current} = $self->{current}->getParentNode;
|
---|
162 | }
|
---|
163 |
|
---|
164 | sub parse_pi {
|
---|
165 | my $self = shift;
|
---|
166 | my $e = shift;
|
---|
167 | my ($target, $data) = @_;
|
---|
168 | my $node = XML::XPath::Node::PI->new($target, $data);
|
---|
169 | $self->{current}->appendChild($node, 1);
|
---|
170 | }
|
---|
171 |
|
---|
172 | sub parse_comment {
|
---|
173 | my $self = shift;
|
---|
174 | my $e = shift;
|
---|
175 | my ($data) = @_;
|
---|
176 | my $node = XML::XPath::Node::Comment->new($data);
|
---|
177 | $self->{current}->appendChild($node, 1);
|
---|
178 | }
|
---|
179 |
|
---|
180 | sub parse_attlist {
|
---|
181 | my $self = shift;
|
---|
182 | my $e = shift;
|
---|
183 | my ($elname, $attname, $type, $default, $fixed) = @_;
|
---|
184 | if ($type eq 'ID') {
|
---|
185 | $self->{IdNames}{$elname} = $attname;
|
---|
186 | }
|
---|
187 | }
|
---|
188 |
|
---|
189 | sub _scan_namespaces {
|
---|
190 | my ($self, %attributes) = @_;
|
---|
191 |
|
---|
192 | while (my ($attr_name, $value) = each %attributes) {
|
---|
193 | if ($attr_name eq 'xmlns') {
|
---|
194 | $self->{InScopeNamespaceStack}[-1]{'_Default'} = $value;
|
---|
195 | } elsif ($attr_name =~ /^xmlns:(.*)$/) {
|
---|
196 | my $prefix = $1;
|
---|
197 | $self->{InScopeNamespaceStack}[-1]{$prefix} = $value;
|
---|
198 | }
|
---|
199 | }
|
---|
200 | }
|
---|
201 |
|
---|
202 | sub _namespace {
|
---|
203 | my ($self, $name) = @_;
|
---|
204 |
|
---|
205 | my ($prefix, $localname) = split(/:/, $name);
|
---|
206 | if (!defined($localname)) {
|
---|
207 | if ($prefix eq 'xmlns') {
|
---|
208 | return '', undef;
|
---|
209 | } else {
|
---|
210 | return '', $self->{InScopeNamespaceStack}[-1]{'_Default'};
|
---|
211 | }
|
---|
212 | } else {
|
---|
213 | return $prefix, $self->{InScopeNamespaceStack}[-1]{$prefix};
|
---|
214 | }
|
---|
215 | }
|
---|
216 |
|
---|
217 | sub as_string {
|
---|
218 | my $node = shift;
|
---|
219 | $node->toString;
|
---|
220 | }
|
---|
221 |
|
---|
222 | sub get_parser { shift->{_parser}; }
|
---|
223 | sub get_filename { shift->{_filename}; }
|
---|
224 | sub get_xml { shift->{_xml}; }
|
---|
225 | sub get_ioref { shift->{_ioref}; }
|
---|
226 |
|
---|
227 | sub set_parser { $_[0]->{_parser} = $_[1]; }
|
---|
228 | sub set_filename { $_[0]->{_filename} = $_[1]; }
|
---|
229 | sub set_xml { $_[0]->{_xml} = $_[1]; }
|
---|
230 | sub set_ioref { $_[0]->{_ioref} = $_[1]; }
|
---|
231 |
|
---|
232 | 1;
|
---|
233 |
|
---|
234 | __END__
|
---|
235 |
|
---|
236 | =head1 NAME
|
---|
237 |
|
---|
238 | XML::XPath::XMLParser - The default XML parsing class that produces a node tree
|
---|
239 |
|
---|
240 | =head1 SYNOPSIS
|
---|
241 |
|
---|
242 | my $parser = XML::XPath::XMLParser->new(
|
---|
243 | filename => $self->get_filename,
|
---|
244 | xml => $self->get_xml,
|
---|
245 | ioref => $self->get_ioref,
|
---|
246 | parser => $self->get_parser,
|
---|
247 | );
|
---|
248 | my $root_node = $parser->parse;
|
---|
249 |
|
---|
250 | =head1 DESCRIPTION
|
---|
251 |
|
---|
252 | This module generates a node tree for use as the context node for XPath processing.
|
---|
253 | It aims to be a quick parser, nothing fancy, and yet has to store more information
|
---|
254 | than most parsers. To achieve this I've used array refs everywhere - no hashes.
|
---|
255 | I don't have any performance figures for the speedups achieved, so I make no
|
---|
256 | appologies for anyone not used to using arrays instead of hashes. I think they
|
---|
257 | make good sense here where we know the attributes of each type of node.
|
---|
258 |
|
---|
259 | =head1 Node Structure
|
---|
260 |
|
---|
261 | All nodes have the same first 2 entries in the array: node_parent
|
---|
262 | and node_pos. The type of the node is determined using the ref() function.
|
---|
263 | The node_parent always contains an entry for the parent of the current
|
---|
264 | node - except for the root node which has undef in there. And node_pos is the
|
---|
265 | position of this node in the array that it is in (think:
|
---|
266 | $node == $node->[node_parent]->[node_children]->[$node->[node_pos]] )
|
---|
267 |
|
---|
268 | Nodes are structured as follows:
|
---|
269 |
|
---|
270 | =head2 Root Node
|
---|
271 |
|
---|
272 | The root node is just an element node with no parent.
|
---|
273 |
|
---|
274 | [
|
---|
275 | undef, # node_parent - check for undef to identify root node
|
---|
276 | undef, # node_pos
|
---|
277 | undef, # node_prefix
|
---|
278 | [ ... ], # node_children (see below)
|
---|
279 | ]
|
---|
280 |
|
---|
281 | =head2 Element Node
|
---|
282 |
|
---|
283 | [
|
---|
284 | $parent, # node_parent
|
---|
285 | <position in current array>, # node_pos
|
---|
286 | 'xxx', # node_prefix - namespace prefix on this element
|
---|
287 | [ ... ], # node_children
|
---|
288 | 'yyy', # node_name - element tag name
|
---|
289 | [ ... ], # node_attribs - attributes on this element
|
---|
290 | [ ... ], # node_namespaces - namespaces currently in scope
|
---|
291 | ]
|
---|
292 |
|
---|
293 | =head2 Attribute Node
|
---|
294 |
|
---|
295 | [
|
---|
296 | $parent, # node_parent - the element node
|
---|
297 | <position in current array>, # node_pos
|
---|
298 | 'xxx', # node_prefix - namespace prefix on this element
|
---|
299 | 'href', # node_key - attribute name
|
---|
300 | 'ftp://ftp.com/', # node_value - value in the node
|
---|
301 | ]
|
---|
302 |
|
---|
303 | =head2 Namespace Nodes
|
---|
304 |
|
---|
305 | Each element has an associated set of namespace nodes that are currently
|
---|
306 | in scope. Each namespace node stores a prefix and the expanded name (retrieved
|
---|
307 | from the xmlns:prefix="..." attribute).
|
---|
308 |
|
---|
309 | [
|
---|
310 | $parent,
|
---|
311 | <pos>,
|
---|
312 | 'a', # node_prefix - the namespace as it was written as a prefix
|
---|
313 | 'http://my.namespace.com', # node_expanded - the expanded name.
|
---|
314 | ]
|
---|
315 |
|
---|
316 | =head2 Text Nodes
|
---|
317 |
|
---|
318 | [
|
---|
319 | $parent,
|
---|
320 | <pos>,
|
---|
321 | 'This is some text' # node_text - the text in the node
|
---|
322 | ]
|
---|
323 |
|
---|
324 | =head2 Comment Nodes
|
---|
325 |
|
---|
326 | [
|
---|
327 | $parent,
|
---|
328 | <pos>,
|
---|
329 | 'This is a comment' # node_comment
|
---|
330 | ]
|
---|
331 |
|
---|
332 | =head2 Processing Instruction Nodes
|
---|
333 |
|
---|
334 | [
|
---|
335 | $parent,
|
---|
336 | <pos>,
|
---|
337 | 'target', # node_target
|
---|
338 | 'data', # node_data
|
---|
339 | ]
|
---|
340 |
|
---|
341 | =head1 Usage
|
---|
342 |
|
---|
343 | If you feel the need to use this module outside of XML::XPath (for example
|
---|
344 | you might use this module directly so that you can cache parsed trees), you
|
---|
345 | can follow the following API:
|
---|
346 |
|
---|
347 | =head2 new
|
---|
348 |
|
---|
349 | The new method takes either no parameters, or any of the following parameters:
|
---|
350 |
|
---|
351 | filename
|
---|
352 | xml
|
---|
353 | parser
|
---|
354 | ioref
|
---|
355 |
|
---|
356 | This uses the familiar hash syntax, so an example might be:
|
---|
357 |
|
---|
358 | use XML::XPath::XMLParser;
|
---|
359 |
|
---|
360 | my $parser = XML::XPath::XMLParser->new(filename => 'example.xml');
|
---|
361 |
|
---|
362 | The parameters represent a filename, a string containing XML, an XML::Parser
|
---|
363 | instance and an open filehandle ref respectively. You can also set or get all
|
---|
364 | of these properties using the get_ and set_ functions that have the same
|
---|
365 | name as the property: e.g. get_filename, set_ioref, etc.
|
---|
366 |
|
---|
367 | =head2 parse
|
---|
368 |
|
---|
369 | The parse method generally takes no parameters, however you are free to
|
---|
370 | pass either an open filehandle reference or an XML string if you so require.
|
---|
371 | The return value is a tree that XML::XPath can use. The parse method will
|
---|
372 | die if there is an error in your XML, so be sure to use perl's exception
|
---|
373 | handling mechanism (eval{};) if you want to avoid this.
|
---|
374 |
|
---|
375 | =head2 parsefile
|
---|
376 |
|
---|
377 | The parsefile method is identical to parse() except it expects a single
|
---|
378 | parameter that is a string naming a file to open and parse. Again it
|
---|
379 | returns a tree and also dies if there are XML errors.
|
---|
380 |
|
---|
381 | =head1 NOTICES
|
---|
382 |
|
---|
383 | This file is distributed as part of the XML::XPath module, and is copyright
|
---|
384 | 2000 Fastnet Software Ltd. Please see the documentation for the module as a
|
---|
385 | whole for licencing information.
|
---|