source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 9462

Last change on this file since 9462 was 9462, checked in by mdewsnip, 19 years ago

Added "use bytes" in all XML::Parser Char functions. This is to prevent nasty encoding problems when using XML::Parser versions 2.31+. See BasPlug's revision 1.58 log message for more information on this problem.

With much help from John McPherson.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30
31sub BEGIN {
32 @XMLPlug::ISA = ('BasPlug');
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
34}
35
36use XMLParser;
37
38my $arguments =
39 [ { 'name' => "process_exp",
40 'desc' => "{BasPlug.process_exp}",
41 'type' => "regexp",
42 'deft' => &get_default_process_exp(),
43 'reqd' => "no" } ];
44
45my $options = { 'name' => "XMLPlug",
46 'desc' => "{XMLPlug.desc}",
47 'abstract' => "yes",
48 'inherits' => "yes",
49 'args' => $arguments };
50
51
52my ($self);
53sub new {
54 my $class = shift (@_);
55
56 # $self is global for use within subroutines called by XML::Parser
57 $self = new BasPlug ($class, @_);
58 $self->{'plugin_type'} = "XMLPlug";
59 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
60 my $option_list = $self->{'option_list'};
61 push( @{$option_list}, $options );
62
63 my $parser = new XML::Parser('Style' => 'Stream',
64 'Handlers' => {'Char' => \&Char,
65 'XMLDecl' => \&XMLDecl,
66 'Entity' => \&Entity,
67 'Doctype' => \&Doctype,
68 'Default' => \&Default
69 });
70
71
72
73 $self->{'parser'} = $parser;
74
75 return bless $self, $class;
76}
77
78
79sub read {
80 # this must be global!
81 $self = shift (@_);
82
83 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
84
85 my $filename = $file;
86 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
87
88 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
89 $self->{'num_blocked'} ++;
90 return 0;
91 }
92 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
93 return undef;
94 }
95 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
96 $self->{'file'} = $file;
97 $self->{'filename'} = $filename;
98 $self->{'processor'} = $processor;
99 $self->{'metadata'} = $metadata;
100
101 eval {
102 $self->{'parser'}->parsefile($filename);
103 };
104
105 if ($@) {
106
107 # parsefile may either croak somewhere in XML::Parser (e.g. because
108 # the document is not well formed) or die somewhere in XMLPlug or a
109 # derived plugin (e.g. because we're attempting to process a
110 # document whose DOCTYPE is not meant for this plugin). For the
111 # first case we'll print a warning and continue, for the second
112 # we'll just continue quietly
113
114 ## print STDERR "**** Error is: $@\n";
115
116 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
117 if (defined $msg) {
118 my $outhandle = $self->{'outhandle'};
119 my $plugin_name = ref ($self);
120 print $outhandle "$plugin_name failed to process $file ($msg)\n";
121 }
122
123 # reset ourself for the next document
124 $self->{'section_level'}=0;
125 return -1; # error during processing
126 }
127 return 1; # processed the file
128}
129
130sub get_default_process_exp {
131 my $self = shift (@_);
132
133 return q^(?i)\.xml$^;
134}
135
136sub StartDocument {$self->xml_start_document(@_);}
137sub XMLDecl {$self->xml_xmldecl(@_);}
138sub Entity {$self->xml_entity(@_);}
139sub Doctype {$self->xml_doctype(@_);}
140sub StartTag {$self->xml_start_tag(@_);}
141sub EndTag {$self->xml_end_tag(@_);}
142sub Text {$self->xml_text(@_);}
143sub PI {$self->xml_pi(@_);}
144sub EndDocument {$self->xml_end_document(@_);}
145sub Default {$self->xml_default(@_);}
146
147# This Char function overrides the one in XML::Parser::Stream to overcome a
148# problem where $expat->{Text} is treated as the return value, slowing
149# things down significantly in some cases.
150sub Char {
151 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
152 $_[0]->{'Text'} .= $_[1];
153 return undef;
154}
155
156# Called at the beginning of the XML document.
157sub xml_start_document {
158 my $self = shift(@_);
159 my ($expat) = @_;
160
161 $self->open_document();
162}
163
164# Called for XML declarations
165sub xml_xmldecl {
166 my $self = shift(@_);
167 my ($expat, $version, $encoding, $standalone) = @_;
168}
169
170# Called for XML entities
171sub xml_entity {
172 my $self = shift(@_);
173 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
174}
175
176# Called for DOCTYPE declarations - use die to bail out if this doctype
177# is not meant for this plugin
178sub xml_doctype {
179 my $self = shift(@_);
180 my ($expat, $name, $sysid, $pubid, $internal) = @_;
181 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
182}
183
184# Called for every start tag. The $_ variable will contain a copy of the
185# tag and the %_ variable will contain the element's attributes.
186sub xml_start_tag {
187 my $self = shift(@_);
188 my ($expat, $element) = @_;
189}
190
191# Called for every end tag. The $_ variable will contain a copy of the tag.
192sub xml_end_tag {
193 my $self = shift(@_);
194 my ($expat, $element) = @_;
195}
196
197# Called just before start or end tags with accumulated non-markup text in
198# the $_ variable.
199sub xml_text {
200 my $self = shift(@_);
201 my ($expat) = @_;
202}
203
204# Called for processing instructions. The $_ variable will contain a copy
205# of the pi.
206sub xml_pi {
207 my $self = shift(@_);
208 my ($expat, $target, $data) = @_;
209}
210
211# Called at the end of the XML document.
212sub xml_end_document {
213 my $self = shift(@_);
214 my ($expat) = @_;
215
216 $self->close_document();
217}
218
219# Called for any characters not handled by the above functions.
220sub xml_default {
221 my $self = shift(@_);
222 my ($expat, $text) = @_;
223}
224
225sub open_document {
226 my $self = shift(@_);
227
228 # create a new document
229 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
230 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
231}
232
233sub close_document {
234 my $self = shift(@_);
235 my $doc_obj = $self->{'doc_obj'};
236 # include any metadata passed in from previous plugins
237 # note that this metadata is associated with the top level section
238 $self->extra_metadata ($doc_obj,
239 $doc_obj->get_top_section(),
240 $self->{'metadata'});
241
242 # do any automatic metadata extraction
243 $self->auto_extract_metadata ($doc_obj);
244
245 # add an OID
246 $doc_obj->set_OID();
247
248 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
249 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
250
251 # process the document
252 $self->{'processor'}->process($doc_obj);
253
254 $self->{'num_processed'} ++;
255}
256
2571;
258
259
260
261
Note: See TracBrowser for help on using the repository browser.