source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 2810

Last change on this file since 2810 was 2810, checked in by sjboddie, 23 years ago

Created GAPlug (and XMLPlug base class) to replace the old GMLPlug.
Greenstone archives will now be stored as proper XML documents (with .xml
file extension) instead of the old .gml files.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.0 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30
31sub BEGIN {
32 @ISA = ('BasPlug');
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
34}
35
36use XML::Parser;
37use XML::Parser::Expat;
38
39my ($self);
40sub new {
41 my $class = shift (@_);
42
43 # $self is global for use within subroutines called by XML::Parser
44 $self = new BasPlug ($class, @_);
45
46 my $parser = new XML::Parser('Style' => 'Stream',
47 'Handlers' => {'Char' => \&Char,
48 'XMLDecl' => \&XMLDecl,
49 'Doctype' => \&Doctype,
50 'Default' => \&Default
51 }
52 );
53 $self->{'parser'} = $parser;
54
55 return bless $self, $class;
56}
57
58
59sub read {
60 my $self = shift (@_);
61
62 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
63
64 my $filename = $file;
65 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
66
67 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
68 $self->{'num_blocked'} ++;
69 return 0;
70 }
71 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
72 return undef;
73 }
74 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
75 $self->{'file'} = $file;
76 $self->{'filename'} = $filename;
77 $self->{'processor'} = $processor;
78 $self->{'metadata'} = $metadata;
79
80 eval {
81 $self->{'parser'}->parsefile($filename);
82 };
83 if ($@) {
84
85 # parsefile may either croak somewhere in XML::Parser (e.g. because
86 # the document is not well formed) or die somewhere in XMLPlug or a
87 # derived plugin (e.g. because we're attempting to process a
88 # document whose DOCTYPE is not meant for this plugin). For the
89 # first case we'll print a warning and continue, for the second
90 # we'll just continue quietly
91
92 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
93 if (defined $msg) {
94 my $outhandle = $self->{'outhandle'};
95 my $plugin_name = ref ($self);
96 print $outhandle "$plugin_name failed to process $file ($msg)\n";
97 }
98
99 print $@;
100
101 return undef;
102 }
103
104 return 1; # processed the file
105}
106
107sub get_default_process_exp {
108 my $self = shift (@_);
109
110 return q^(?i)\.xml$^;
111}
112
113sub StartDocument {$self->xml_start_document(@_);}
114sub XMLDecl {$self->xml_xmldecl(@_);}
115sub Doctype {$self->xml_doctype(@_);}
116sub StartTag {$self->xml_start_tag(@_);}
117sub EndTag {$self->xml_end_tag(@_);}
118sub Text {$self->xml_text(@_);}
119sub PI {$self->xml_pi(@_);}
120sub EndDocument {$self->xml_end_document(@_);}
121sub Default {$self->xml_default(@_);}
122
123# This Char function overrides the one in XML::Parser::Stream to overcome a
124# problem where $expat->{Text} is treated as the return value, slowing
125# things down significantly in some cases.
126sub Char {
127 $_[0]->{'Text'} .= $_[1];
128 return undef;
129}
130
131# Called at the beginning of the XML document.
132sub xml_start_document {
133 my $self = shift(@_);
134 my ($expat) = @_;
135
136 $self->open_document();
137}
138
139# Called for XML declarations
140sub xml_xmldecl {
141 my $self = shift(@_);
142 my ($expat, $version, $encoding, $standalone) = @_;
143}
144
145# Called for DOCTYPE declarations - use die to bail out if this doctype
146# is not meant for this plugin
147sub xml_doctype {
148 my $self = shift(@_);
149 my ($expat, $name, $sysid, $pubid, $internal) = @_;
150 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
151}
152
153# Called for every start tag. The $_ variable will contain a copy of the
154# tag and the %_ variable will contain the element's attributes.
155sub xml_start_tag {
156 my $self = shift(@_);
157 my ($expat, $element) = @_;
158}
159
160# Called for every end tag. The $_ variable will contain a copy of the tag.
161sub xml_end_tag {
162 my $self = shift(@_);
163 my ($expat, $element) = @_;
164}
165
166# Called just before start or end tags with accumulated non-markup text in
167# the $_ variable.
168sub xml_text {
169 my $self = shift(@_);
170 my ($expat) = @_;
171}
172
173# Called for processing instructions. The $_ variable will contain a copy
174# of the pi.
175sub xml_pi {
176 my $self = shift(@_);
177 my ($expat, $target, $data) = @_;
178}
179
180# Called at the end of the XML document.
181sub xml_end_document {
182 my $self = shift(@_);
183 my ($expat) = @_;
184
185 $self->close_document();
186}
187
188# Called for any characters not handled by the above functions.
189sub xml_default {
190 my $self = shift(@_);
191 my ($expat, $text) = @_;
192}
193
194sub open_document {
195 my $self = shift(@_);
196
197 # create a new document
198 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
199 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
200}
201
202sub close_document {
203 my $self = shift(@_);
204
205 # include any metadata passed in from previous plugins
206 # note that this metadata is associated with the top level section
207 $self->extra_metadata ($self->{'doc_obj'},
208 $self->{'doc_obj'}->get_top_section(),
209 $self->{'metadata'});
210
211 # do any automatic metadata extraction
212 $self->auto_extract_metadata ($self->{'doc_obj'});
213
214 # add an OID
215 $self->{'doc_obj'}->set_OID();
216
217 # process the document
218 $self->{'processor'}->process($self->{'doc_obj'});
219
220 $self->{'num_processed'} ++;
221}
222
2231;
224
Note: See TracBrowser for help on using the repository browser.