source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/XML/XPath/XMLParser.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 10.2 KB
Line 
1# $Id: XMLParser.pm 7909 2004-08-06 05:11:55Z mdewsnip $
2
3package XML::XPath::XMLParser;
4
5use strict;
6
7use XML::Parser;
8#use XML::XPath;
9use XML::XPath::Node;
10use XML::XPath::Node::Element;
11use XML::XPath::Node::Text;
12use XML::XPath::Node::Comment;
13use XML::XPath::Node::PI;
14use XML::XPath::Node::Attribute;
15use XML::XPath::Node::Namespace;
16
17my @options = qw(
18 filename
19 xml
20 parser
21 ioref
22 );
23
24my ($_current, $_namespaces_on);
25my %IdNames;
26
27use vars qw/$xmlns_ns $xml_ns/;
28
29$xmlns_ns = "http://www.w3.org/2000/xmlns/";
30$xml_ns = "http://www.w3.org/XML/1998/namespace";
31
32sub new {
33 my $proto = shift;
34 my $class = ref($proto) || $proto;
35 my %args = @_;
36 my %hash = map(( "_$_" => $args{$_} ), @options);
37 bless \%hash, $class;
38}
39
40sub parse {
41 my $self = shift;
42
43 $self->{IdNames} = {};
44 $self->{InScopeNamespaceStack} = [ {
45 '_Default' => undef,
46 'xmlns' => $xmlns_ns,
47 'xml' => $xml_ns,
48 } ];
49
50 $self->{NodeStack} = [ ];
51
52 $self->set_xml($_[0]) if $_[0];
53
54 my $parser = $self->get_parser || XML::Parser->new(
55 ErrorContext => 2,
56 ParseParamEnt => 1,
57 );
58
59 $parser->setHandlers(
60 Init => sub { $self->parse_init(@_) },
61 Char => sub { $self->parse_char(@_) },
62 Start => sub { $self->parse_start(@_) },
63 End => sub { $self->parse_end(@_) },
64 Final => sub { $self->parse_final(@_) },
65 Proc => sub { $self->parse_pi(@_) },
66 Comment => sub { $self->parse_comment(@_) },
67 Attlist => sub { $self->parse_attlist(@_) },
68 );
69
70 my $toparse;
71 if ($toparse = $self->get_filename) {
72 return $parser->parsefile($toparse);
73 }
74 else {
75 return $parser->parse($self->get_xml || $self->get_ioref);
76 }
77}
78
79sub parsefile {
80 my $self = shift;
81 my ($filename) = @_;
82 $self->set_filename($filename);
83 $self->parse;
84}
85
86sub parse_init {
87 my $self = shift;
88 my $e = shift;
89 my $document = XML::XPath::Node::Element->new();
90 my $newns = XML::XPath::Node::Namespace->new('xml', $xml_ns);
91 $document->appendNamespace($newns);
92 $self->{current} = $self->{DOC_Node} = $document;
93}
94
95sub parse_final {
96 my $self = shift;
97 return $self->{DOC_Node};
98}
99
100sub parse_char {
101 my $self = shift;
102 my $e = shift;
103 my $text = shift;
104
105 my $parent = $self->{current};
106
107 my $last = $parent->getLastChild;
108 if ($last && $last->isTextNode) {
109 # append to previous text node
110 $last->appendText($text);
111 return;
112 }
113
114 my $node = XML::XPath::Node::Text->new($text);
115 $parent->appendChild($node, 1);
116}
117
118sub parse_start {
119 my $self = shift;
120 my $e = shift;
121 my $tag = shift;
122
123 push @{ $self->{InScopeNamespaceStack} },
124 { %{ $self->{InScopeNamespaceStack}[-1] } };
125 $self->_scan_namespaces(@_);
126
127 my ($prefix, $namespace) = $self->_namespace($tag);
128
129 my $node = XML::XPath::Node::Element->new($tag, $prefix);
130
131 my @attributes;
132 for (my $ii = 0; $ii < $#_; $ii += 2) {
133 my ($name, $value) = ($_[$ii], $_[$ii+1]);
134 if ($name =~ /^xmlns(:(.*))?$/) {
135 # namespace node
136 my $prefix = $2 || '#default';
137# warn "Creating NS node: $prefix = $value\n";
138 my $newns = XML::XPath::Node::Namespace->new($prefix, $value);
139 $node->appendNamespace($newns);
140 }
141 else {
142 my ($prefix, $namespace) = $self->_namespace($name);
143 undef $namespace unless $prefix;
144
145 my $newattr = XML::XPath::Node::Attribute->new($name, $value, $prefix);
146 $node->appendAttribute($newattr, 1);
147 if (exists($self->{IdNames}{$tag}) && ($self->{IdNames}{$tag} eq $name)) {
148 # warn "appending Id Element: $val for ", $node->getName, "\n";
149 $self->{DOC_Node}->appendIdElement($value, $node);
150 }
151 }
152 }
153
154 $self->{current}->appendChild($node, 1);
155 $self->{current} = $node;
156}
157
158sub parse_end {
159 my $self = shift;
160 my $e = shift;
161 $self->{current} = $self->{current}->getParentNode;
162}
163
164sub parse_pi {
165 my $self = shift;
166 my $e = shift;
167 my ($target, $data) = @_;
168 my $node = XML::XPath::Node::PI->new($target, $data);
169 $self->{current}->appendChild($node, 1);
170}
171
172sub parse_comment {
173 my $self = shift;
174 my $e = shift;
175 my ($data) = @_;
176 my $node = XML::XPath::Node::Comment->new($data);
177 $self->{current}->appendChild($node, 1);
178}
179
180sub parse_attlist {
181 my $self = shift;
182 my $e = shift;
183 my ($elname, $attname, $type, $default, $fixed) = @_;
184 if ($type eq 'ID') {
185 $self->{IdNames}{$elname} = $attname;
186 }
187}
188
189sub _scan_namespaces {
190 my ($self, %attributes) = @_;
191
192 while (my ($attr_name, $value) = each %attributes) {
193 if ($attr_name eq 'xmlns') {
194 $self->{InScopeNamespaceStack}[-1]{'_Default'} = $value;
195 } elsif ($attr_name =~ /^xmlns:(.*)$/) {
196 my $prefix = $1;
197 $self->{InScopeNamespaceStack}[-1]{$prefix} = $value;
198 }
199 }
200}
201
202sub _namespace {
203 my ($self, $name) = @_;
204
205 my ($prefix, $localname) = split(/:/, $name);
206 if (!defined($localname)) {
207 if ($prefix eq 'xmlns') {
208 return '', undef;
209 } else {
210 return '', $self->{InScopeNamespaceStack}[-1]{'_Default'};
211 }
212 } else {
213 return $prefix, $self->{InScopeNamespaceStack}[-1]{$prefix};
214 }
215}
216
217sub as_string {
218 my $node = shift;
219 $node->toString;
220}
221
222sub get_parser { shift->{_parser}; }
223sub get_filename { shift->{_filename}; }
224sub get_xml { shift->{_xml}; }
225sub get_ioref { shift->{_ioref}; }
226
227sub set_parser { $_[0]->{_parser} = $_[1]; }
228sub set_filename { $_[0]->{_filename} = $_[1]; }
229sub set_xml { $_[0]->{_xml} = $_[1]; }
230sub set_ioref { $_[0]->{_ioref} = $_[1]; }
231
2321;
233
234__END__
235
236=head1 NAME
237
238XML::XPath::XMLParser - The default XML parsing class that produces a node tree
239
240=head1 SYNOPSIS
241
242 my $parser = XML::XPath::XMLParser->new(
243 filename => $self->get_filename,
244 xml => $self->get_xml,
245 ioref => $self->get_ioref,
246 parser => $self->get_parser,
247 );
248 my $root_node = $parser->parse;
249
250=head1 DESCRIPTION
251
252This module generates a node tree for use as the context node for XPath processing.
253It aims to be a quick parser, nothing fancy, and yet has to store more information
254than most parsers. To achieve this I've used array refs everywhere - no hashes.
255I don't have any performance figures for the speedups achieved, so I make no
256appologies for anyone not used to using arrays instead of hashes. I think they
257make good sense here where we know the attributes of each type of node.
258
259=head1 Node Structure
260
261All nodes have the same first 2 entries in the array: node_parent
262and node_pos. The type of the node is determined using the ref() function.
263The node_parent always contains an entry for the parent of the current
264node - except for the root node which has undef in there. And node_pos is the
265position of this node in the array that it is in (think:
266$node == $node->[node_parent]->[node_children]->[$node->[node_pos]] )
267
268Nodes are structured as follows:
269
270=head2 Root Node
271
272The root node is just an element node with no parent.
273
274 [
275 undef, # node_parent - check for undef to identify root node
276 undef, # node_pos
277 undef, # node_prefix
278 [ ... ], # node_children (see below)
279 ]
280
281=head2 Element Node
282
283 [
284 $parent, # node_parent
285 <position in current array>, # node_pos
286 'xxx', # node_prefix - namespace prefix on this element
287 [ ... ], # node_children
288 'yyy', # node_name - element tag name
289 [ ... ], # node_attribs - attributes on this element
290 [ ... ], # node_namespaces - namespaces currently in scope
291 ]
292
293=head2 Attribute Node
294
295 [
296 $parent, # node_parent - the element node
297 <position in current array>, # node_pos
298 'xxx', # node_prefix - namespace prefix on this element
299 'href', # node_key - attribute name
300 'ftp://ftp.com/', # node_value - value in the node
301 ]
302
303=head2 Namespace Nodes
304
305Each element has an associated set of namespace nodes that are currently
306in scope. Each namespace node stores a prefix and the expanded name (retrieved
307from the xmlns:prefix="..." attribute).
308
309 [
310 $parent,
311 <pos>,
312 'a', # node_prefix - the namespace as it was written as a prefix
313 'http://my.namespace.com', # node_expanded - the expanded name.
314 ]
315
316=head2 Text Nodes
317
318 [
319 $parent,
320 <pos>,
321 'This is some text' # node_text - the text in the node
322 ]
323
324=head2 Comment Nodes
325
326 [
327 $parent,
328 <pos>,
329 'This is a comment' # node_comment
330 ]
331
332=head2 Processing Instruction Nodes
333
334 [
335 $parent,
336 <pos>,
337 'target', # node_target
338 'data', # node_data
339 ]
340
341=head1 Usage
342
343If you feel the need to use this module outside of XML::XPath (for example
344you might use this module directly so that you can cache parsed trees), you
345can follow the following API:
346
347=head2 new
348
349The new method takes either no parameters, or any of the following parameters:
350
351 filename
352 xml
353 parser
354 ioref
355
356This uses the familiar hash syntax, so an example might be:
357
358 use XML::XPath::XMLParser;
359
360 my $parser = XML::XPath::XMLParser->new(filename => 'example.xml');
361
362The parameters represent a filename, a string containing XML, an XML::Parser
363instance and an open filehandle ref respectively. You can also set or get all
364of these properties using the get_ and set_ functions that have the same
365name as the property: e.g. get_filename, set_ioref, etc.
366
367=head2 parse
368
369The parse method generally takes no parameters, however you are free to
370pass either an open filehandle reference or an XML string if you so require.
371The return value is a tree that XML::XPath can use. The parse method will
372die if there is an error in your XML, so be sure to use perl's exception
373handling mechanism (eval{};) if you want to avoid this.
374
375=head2 parsefile
376
377The parsefile method is identical to parse() except it expects a single
378parameter that is a string naming a file to open and parse. Again it
379returns a tree and also dies if there are XML errors.
380
381=head1 NOTICES
382
383This file is distributed as part of the XML::XPath module, and is copyright
3842000 Fastnet Software Ltd. Please see the documentation for the module as a
385whole for licencing information.
Note: See TracBrowser for help on using the repository browser.