source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 9957

Last change on this file since 9957 was 9957, checked in by davidb, 19 years ago

Introduction of XSLT support to XML based plugins. This is done through
a minus option (-xslt) that allows the input XML to be transformed before it
is processed by the plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30
31sub BEGIN {
32 @XMLPlug::ISA = ('BasPlug');
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
34}
35
36use XMLParser;
37
38my $arguments =
39 [ { 'name' => "process_exp",
40 'desc' => "{BasPlug.process_exp}",
41 'type' => "regexp",
42 'deft' => &get_default_process_exp(),
43 'reqd' => "no" },
44 { 'name' => "xslt",
45 'desc' => "{XMLPlug.xslt}",
46 'type' => "string",
47 'deft' => "",
48 'reqd' => "no" } ];
49
50my $options = { 'name' => "XMLPlug",
51 'desc' => "{XMLPlug.desc}",
52 'abstract' => "yes",
53 'inherits' => "yes",
54 'args' => $arguments };
55
56
57my ($self);
58sub new {
59 my $class = shift (@_);
60
61 # $self is global for use within subroutines called by XML::Parser
62 $self = new BasPlug ($class, @_);
63 $self->{'plugin_type'} = "XMLPlug";
64 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
65 my $option_list = $self->{'option_list'};
66 push( @{$option_list}, $options );
67
68 if (!parsargv::parse(\@_,
69 q^xslt/.*/^, \$self->{'xslt'},
70 "allow_extra_options")) {
71
72 print STDERR "\nIncorrect options passed to XSLTPlug, check your collect.cfg configuration file\n";
73 $self->print_txt_usage(""); # Use default resource bundle
74 die "\n";
75 }
76
77
78 my $parser = new XML::Parser('Style' => 'Stream',
79 'Handlers' => {'Char' => \&Char,
80 'XMLDecl' => \&XMLDecl,
81 'Entity' => \&Entity,
82 'Doctype' => \&Doctype,
83 'Default' => \&Default
84 });
85
86
87
88 $self->{'parser'} = $parser;
89
90 return bless $self, $class;
91}
92
93sub apply_xslt
94{
95 my $self = shift @_;
96 my ($xslt,$filename) = @_;
97
98 my $outhandle = $self->{'outhandle'};
99
100 my $xslt_filename = $xslt;
101
102 if (! -e $xslt_filename) {
103 # Look in main site directory
104 my $gsdlhome = $ENV{'GSDLHOME'};
105 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
106 }
107
108 if (! -e $xslt_filename) {
109 # Look in collection directory
110 my $coldir = $ENV{'COLLECTDIR'};
111 $xslt_filename = &util::filename_cat($coldir,$xslt);
112 }
113
114 if (! -e $xslt_filename) {
115 print $outhandle "Warning: Unable to find XSLT $xslt\n";
116 if (open(XMLIN,"<$filename")) {
117
118 $untransformed_xml = "";
119 while (defined (my $line = <XMLIN>)) {
120
121 $untransformed_xml .= $line;
122 }
123 close(XMLIN);
124
125 return $untransformed_xml;
126 }
127 else {
128 print $outhandle "Error: Unable to open file $filename\n";
129 print $outhandle " $!\n";
130 return "";
131 }
132
133 }
134
135 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
136 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
137 my $xslt_base_cmd = "java -jar $jar_filename";
138 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
139
140 my $transformed_xml = "";
141
142 if (open(XSLT_IN,"$xslt_cmd |")) {
143 while (defined (my $line = <XSLT_IN>)) {
144
145 $transformed_xml .= $line;
146 }
147 close(XSLT_IN);
148 }
149 else {
150 print $outhandle "Error: Unable to run command $xsl_cmd\n";
151 print $outhandle " $!\n";
152 }
153
154 return $transformed_xml;
155
156}
157
158
159sub read {
160 # $self must be global to work with XML callback routines.
161 $self = shift (@_);
162
163 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
164
165 my $filename = $file;
166 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
167
168 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
169 $self->{'num_blocked'} ++;
170 return 0;
171 }
172 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
173 return undef;
174 }
175
176 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
177 $self->{'file'} = $file;
178 $self->{'filename'} = $filename;
179 $self->{'processor'} = $processor;
180 $self->{'metadata'} = $metadata;
181 $self->{'gli'} = $gli;
182
183 eval {
184 my $xslt = $self->{'xslt'};
185 if (defined $xslt && ($xslt ne "")) {
186 # perform xslt
187 my $transformed_xml = $self->apply_xslt($xslt,$filename);
188
189 # feed transformed file (now in memory as string) into XML parser
190 $self->{'parser'}->parse($transformed_xml);
191 }
192 else {
193 $self->{'parser'}->parsefile($filename);
194 }
195 };
196
197 if ($@) {
198
199 # parsefile may either croak somewhere in XML::Parser (e.g. because
200 # the document is not well formed) or die somewhere in XMLPlug or a
201 # derived plugin (e.g. because we're attempting to process a
202 # document whose DOCTYPE is not meant for this plugin). For the
203 # first case we'll print a warning and continue, for the second
204 # we'll just continue quietly
205
206 ## print STDERR "**** Error is: $@\n";
207
208 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
209 if (defined $msg) {
210 my $outhandle = $self->{'outhandle'};
211 my $plugin_name = ref ($self);
212 print $outhandle "$plugin_name failed to process $file ($msg)\n";
213 }
214
215 # reset ourself for the next document
216 $self->{'section_level'}=0;
217 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
218 return -1; # error during processing
219 }
220
221 return 1; # processed the file
222}
223
224sub get_default_process_exp {
225 my $self = shift (@_);
226
227 return q^(?i)\.xml$^;
228}
229
230sub StartDocument {$self->xml_start_document(@_);}
231sub XMLDecl {$self->xml_xmldecl(@_);}
232sub Entity {$self->xml_entity(@_);}
233sub Doctype {$self->xml_doctype(@_);}
234sub StartTag {$self->xml_start_tag(@_);}
235sub EndTag {$self->xml_end_tag(@_);}
236sub Text {$self->xml_text(@_);}
237sub PI {$self->xml_pi(@_);}
238sub EndDocument {$self->xml_end_document(@_);}
239sub Default {$self->xml_default(@_);}
240
241# This Char function overrides the one in XML::Parser::Stream to overcome a
242# problem where $expat->{Text} is treated as the return value, slowing
243# things down significantly in some cases.
244sub Char {
245 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
246 $_[0]->{'Text'} .= $_[1];
247 return undef;
248}
249
250# Called at the beginning of the XML document.
251sub xml_start_document {
252 my $self = shift(@_);
253 my ($expat) = @_;
254
255 $self->open_document();
256}
257
258# Called for XML declarations
259sub xml_xmldecl {
260 my $self = shift(@_);
261 my ($expat, $version, $encoding, $standalone) = @_;
262}
263
264# Called for XML entities
265sub xml_entity {
266 my $self = shift(@_);
267 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
268}
269
270# Called for DOCTYPE declarations - use die to bail out if this doctype
271# is not meant for this plugin
272sub xml_doctype {
273 my $self = shift(@_);
274 my ($expat, $name, $sysid, $pubid, $internal) = @_;
275 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
276}
277
278# Called for every start tag. The $_ variable will contain a copy of the
279# tag and the %_ variable will contain the element's attributes.
280sub xml_start_tag {
281 my $self = shift(@_);
282 my ($expat, $element) = @_;
283}
284
285# Called for every end tag. The $_ variable will contain a copy of the tag.
286sub xml_end_tag {
287 my $self = shift(@_);
288 my ($expat, $element) = @_;
289}
290
291# Called just before start or end tags with accumulated non-markup text in
292# the $_ variable.
293sub xml_text {
294 my $self = shift(@_);
295 my ($expat) = @_;
296}
297
298# Called for processing instructions. The $_ variable will contain a copy
299# of the pi.
300sub xml_pi {
301 my $self = shift(@_);
302 my ($expat, $target, $data) = @_;
303}
304
305# Called at the end of the XML document.
306sub xml_end_document {
307 my $self = shift(@_);
308 my ($expat) = @_;
309
310 $self->close_document();
311}
312
313# Called for any characters not handled by the above functions.
314sub xml_default {
315 my $self = shift(@_);
316 my ($expat, $text) = @_;
317}
318
319sub open_document {
320 my $self = shift(@_);
321
322 # create a new document
323 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
324 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
325}
326
327sub close_document {
328 my $self = shift(@_);
329 my $doc_obj = $self->{'doc_obj'};
330 # include any metadata passed in from previous plugins
331 # note that this metadata is associated with the top level section
332 $self->extra_metadata ($doc_obj,
333 $doc_obj->get_top_section(),
334 $self->{'metadata'});
335
336 # do any automatic metadata extraction
337 $self->auto_extract_metadata ($doc_obj);
338
339 # add an OID
340 $doc_obj->set_OID();
341
342 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
343 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
344
345 # process the document
346 $self->{'processor'}->process($doc_obj);
347
348 $self->{'num_processed'} ++;
349}
350
3511;
352
353
354
355
Note: See TracBrowser for help on using the repository browser.