source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 3517

Last change on this file since 3517 was 3107, checked in by jrm21, 22 years ago

fixed problem where documents after a "bad" document would not be
read properly. We now clean up properly after a bad document.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30
31sub BEGIN {
32 @ISA = ('BasPlug');
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
34}
35
36use XML::Parser;
37
38my ($self);
39sub new {
40 my $class = shift (@_);
41
42 # $self is global for use within subroutines called by XML::Parser
43 $self = new BasPlug ($class, @_);
44
45 my $parser = new XML::Parser('Style' => 'Stream',
46 'Handlers' => {'Char' => \&Char,
47 'XMLDecl' => \&XMLDecl,
48 'Entity' => \&Entity,
49 'Doctype' => \&Doctype,
50 'Default' => \&Default
51 }
52 );
53 $self->{'parser'} = $parser;
54
55 return bless $self, $class;
56}
57
58
59sub read {
60 # this must be global!
61 $self = shift (@_);
62
63 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
64
65 my $filename = $file;
66 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
67
68 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
69 $self->{'num_blocked'} ++;
70 return 0;
71 }
72 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
73 return undef;
74 }
75 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
76 $self->{'file'} = $file;
77 $self->{'filename'} = $filename;
78 $self->{'processor'} = $processor;
79 $self->{'metadata'} = $metadata;
80
81 eval {
82 $self->{'parser'}->parsefile($filename);
83 };
84 if ($@) {
85
86 # parsefile may either croak somewhere in XML::Parser (e.g. because
87 # the document is not well formed) or die somewhere in XMLPlug or a
88 # derived plugin (e.g. because we're attempting to process a
89 # document whose DOCTYPE is not meant for this plugin). For the
90 # first case we'll print a warning and continue, for the second
91 # we'll just continue quietly
92
93 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
94 if (defined $msg) {
95 my $outhandle = $self->{'outhandle'};
96 my $plugin_name = ref ($self);
97 print $outhandle "$plugin_name failed to process $file ($msg)\n";
98 }
99 # reset ourself for the next document
100 $self->{'section_level'}=0;
101 return undef;
102 }
103
104 return 1; # processed the file
105}
106
107sub get_default_process_exp {
108 my $self = shift (@_);
109
110 return q^(?i)\.xml$^;
111}
112
113sub StartDocument {$self->xml_start_document(@_);}
114sub XMLDecl {$self->xml_xmldecl(@_);}
115sub Entity {$self->xml_entity(@_);}
116sub Doctype {$self->xml_doctype(@_);}
117sub StartTag {$self->xml_start_tag(@_);}
118sub EndTag {$self->xml_end_tag(@_);}
119sub Text {$self->xml_text(@_);}
120sub PI {$self->xml_pi(@_);}
121sub EndDocument {$self->xml_end_document(@_);}
122sub Default {$self->xml_default(@_);}
123
124# This Char function overrides the one in XML::Parser::Stream to overcome a
125# problem where $expat->{Text} is treated as the return value, slowing
126# things down significantly in some cases.
127sub Char {
128 $_[0]->{'Text'} .= $_[1];
129 return undef;
130}
131
132# Called at the beginning of the XML document.
133sub xml_start_document {
134 my $self = shift(@_);
135 my ($expat) = @_;
136
137 $self->open_document();
138}
139
140# Called for XML declarations
141sub xml_xmldecl {
142 my $self = shift(@_);
143 my ($expat, $version, $encoding, $standalone) = @_;
144}
145
146# Called for XML entities
147sub xml_entity {
148 my $self = shift(@_);
149 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
150}
151
152# Called for DOCTYPE declarations - use die to bail out if this doctype
153# is not meant for this plugin
154sub xml_doctype {
155 my $self = shift(@_);
156 my ($expat, $name, $sysid, $pubid, $internal) = @_;
157 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
158}
159
160# Called for every start tag. The $_ variable will contain a copy of the
161# tag and the %_ variable will contain the element's attributes.
162sub xml_start_tag {
163 my $self = shift(@_);
164 my ($expat, $element) = @_;
165}
166
167# Called for every end tag. The $_ variable will contain a copy of the tag.
168sub xml_end_tag {
169 my $self = shift(@_);
170 my ($expat, $element) = @_;
171}
172
173# Called just before start or end tags with accumulated non-markup text in
174# the $_ variable.
175sub xml_text {
176 my $self = shift(@_);
177 my ($expat) = @_;
178}
179
180# Called for processing instructions. The $_ variable will contain a copy
181# of the pi.
182sub xml_pi {
183 my $self = shift(@_);
184 my ($expat, $target, $data) = @_;
185}
186
187# Called at the end of the XML document.
188sub xml_end_document {
189 my $self = shift(@_);
190 my ($expat) = @_;
191
192 $self->close_document();
193}
194
195# Called for any characters not handled by the above functions.
196sub xml_default {
197 my $self = shift(@_);
198 my ($expat, $text) = @_;
199}
200
201sub open_document {
202 my $self = shift(@_);
203
204 # create a new document
205 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
206 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
207}
208
209sub close_document {
210 my $self = shift(@_);
211
212 # include any metadata passed in from previous plugins
213 # note that this metadata is associated with the top level section
214 $self->extra_metadata ($self->{'doc_obj'},
215 $self->{'doc_obj'}->get_top_section(),
216 $self->{'metadata'});
217
218 # do any automatic metadata extraction
219 $self->auto_extract_metadata ($self->{'doc_obj'});
220
221 # add an OID
222 $self->{'doc_obj'}->set_OID();
223
224 # process the document
225 $self->{'processor'}->process($self->{'doc_obj'});
226
227 $self->{'num_processed'} ++;
228}
229
2301;
231
Note: See TracBrowser for help on using the repository browser.