source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 4744

Last change on this file since 4744 was 4744, checked in by mdewsnip, 21 years ago

Tidied up and structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30
31sub BEGIN {
32 @ISA = ('BasPlug');
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
34}
35
36use XML::Parser;
37
38my $arguments =
39 [ { 'name' => "process_exp",
40 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
41 'type' => "string",
42 'deft' => &get_default_process_exp(),
43 'reqd' => "no" } ];
44
45my $options = { 'name' => "XMLPlug",
46 'desc' => "Base class for XML plugins.",
47 'inherits' => "yes",
48 'args' => $arguments };
49
50
51my ($self);
52sub new {
53 my $class = shift (@_);
54
55 # $self is global for use within subroutines called by XML::Parser
56 $self = new BasPlug ($class, @_);
57
58
59 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
60 my $option_list = $self->{'option_list'};
61 push( @{$option_list}, $options );
62
63 my $parser = new XML::Parser('Style' => 'Stream',
64 'Handlers' => {'Char' => \&Char,
65 'XMLDecl' => \&XMLDecl,
66 'Entity' => \&Entity,
67 'Doctype' => \&Doctype,
68 'Default' => \&Default
69 }
70 );
71 $self->{'parser'} = $parser;
72
73 return bless $self, $class;
74}
75
76
77sub read {
78 # this must be global!
79 $self = shift (@_);
80
81 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
82
83 my $filename = $file;
84 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
85
86 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
87 $self->{'num_blocked'} ++;
88 return 0;
89 }
90 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
91 return undef;
92 }
93 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
94 $self->{'file'} = $file;
95 $self->{'filename'} = $filename;
96 $self->{'processor'} = $processor;
97 $self->{'metadata'} = $metadata;
98
99 eval {
100 $self->{'parser'}->parsefile($filename);
101 };
102 if ($@) {
103
104 # parsefile may either croak somewhere in XML::Parser (e.g. because
105 # the document is not well formed) or die somewhere in XMLPlug or a
106 # derived plugin (e.g. because we're attempting to process a
107 # document whose DOCTYPE is not meant for this plugin). For the
108 # first case we'll print a warning and continue, for the second
109 # we'll just continue quietly
110
111 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
112 if (defined $msg) {
113 my $outhandle = $self->{'outhandle'};
114 my $plugin_name = ref ($self);
115 print $outhandle "$plugin_name failed to process $file ($msg)\n";
116 }
117 # reset ourself for the next document
118 $self->{'section_level'}=0;
119 return undef;
120 }
121
122 return 1; # processed the file
123}
124
125sub get_default_process_exp {
126 my $self = shift (@_);
127
128 return q^(?i)\.xml$^;
129}
130
131sub StartDocument {$self->xml_start_document(@_);}
132sub XMLDecl {$self->xml_xmldecl(@_);}
133sub Entity {$self->xml_entity(@_);}
134sub Doctype {$self->xml_doctype(@_);}
135sub StartTag {$self->xml_start_tag(@_);}
136sub EndTag {$self->xml_end_tag(@_);}
137sub Text {$self->xml_text(@_);}
138sub PI {$self->xml_pi(@_);}
139sub EndDocument {$self->xml_end_document(@_);}
140sub Default {$self->xml_default(@_);}
141
142# This Char function overrides the one in XML::Parser::Stream to overcome a
143# problem where $expat->{Text} is treated as the return value, slowing
144# things down significantly in some cases.
145sub Char {
146 $_[0]->{'Text'} .= $_[1];
147 return undef;
148}
149
150# Called at the beginning of the XML document.
151sub xml_start_document {
152 my $self = shift(@_);
153 my ($expat) = @_;
154
155 $self->open_document();
156}
157
158# Called for XML declarations
159sub xml_xmldecl {
160 my $self = shift(@_);
161 my ($expat, $version, $encoding, $standalone) = @_;
162}
163
164# Called for XML entities
165sub xml_entity {
166 my $self = shift(@_);
167 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
168}
169
170# Called for DOCTYPE declarations - use die to bail out if this doctype
171# is not meant for this plugin
172sub xml_doctype {
173 my $self = shift(@_);
174 my ($expat, $name, $sysid, $pubid, $internal) = @_;
175 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
176}
177
178# Called for every start tag. The $_ variable will contain a copy of the
179# tag and the %_ variable will contain the element's attributes.
180sub xml_start_tag {
181 my $self = shift(@_);
182 my ($expat, $element) = @_;
183}
184
185# Called for every end tag. The $_ variable will contain a copy of the tag.
186sub xml_end_tag {
187 my $self = shift(@_);
188 my ($expat, $element) = @_;
189}
190
191# Called just before start or end tags with accumulated non-markup text in
192# the $_ variable.
193sub xml_text {
194 my $self = shift(@_);
195 my ($expat) = @_;
196}
197
198# Called for processing instructions. The $_ variable will contain a copy
199# of the pi.
200sub xml_pi {
201 my $self = shift(@_);
202 my ($expat, $target, $data) = @_;
203}
204
205# Called at the end of the XML document.
206sub xml_end_document {
207 my $self = shift(@_);
208 my ($expat) = @_;
209
210 $self->close_document();
211}
212
213# Called for any characters not handled by the above functions.
214sub xml_default {
215 my $self = shift(@_);
216 my ($expat, $text) = @_;
217}
218
219sub open_document {
220 my $self = shift(@_);
221
222 # create a new document
223 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
224 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
225}
226
227sub close_document {
228 my $self = shift(@_);
229
230 # include any metadata passed in from previous plugins
231 # note that this metadata is associated with the top level section
232 $self->extra_metadata ($self->{'doc_obj'},
233 $self->{'doc_obj'}->get_top_section(),
234 $self->{'metadata'});
235
236 # do any automatic metadata extraction
237 $self->auto_extract_metadata ($self->{'doc_obj'});
238
239 # add an OID
240 $self->{'doc_obj'}->set_OID();
241
242 # process the document
243 $self->{'processor'}->process($self->{'doc_obj'});
244
245 $self->{'num_processed'} ++;
246}
247
2481;
249
Note: See TracBrowser for help on using the repository browser.