source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 11333

Last change on this file since 11333 was 11333, checked in by mdewsnip, 18 years ago

Now consistently sets $self->{'gli'} in plugin::begin.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @XMLPlug::ISA = ('BasPlug');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{XMLPlug.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "XMLPlug",
53 'desc' => "{XMLPlug.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58
59our ($self);
60sub new {
61 my ($class) = shift (@_);
62 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63 push(@$pluginlist, $class);
64
65 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67
68 # $self is global for use within subroutines called by XML::Parser
69 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
70
71 if ($self->{'info_only'}) {
72 # don't worry about any options etc
73 return bless $self, $class;
74 }
75
76 my $parser = new XML::Parser('Style' => 'Stream',
77 'Handlers' => {'Char' => \&Char,
78 'XMLDecl' => \&XMLDecl,
79 'Entity' => \&Entity,
80 'Doctype' => \&Doctype,
81 'Default' => \&Default
82 });
83 $self->{'parser'} = $parser;
84
85 return bless $self, $class;
86}
87
88sub apply_xslt
89{
90 my $self = shift @_;
91 my ($xslt,$filename) = @_;
92
93 my $outhandle = $self->{'outhandle'};
94
95 my $xslt_filename = $xslt;
96
97 if (! -e $xslt_filename) {
98 # Look in main site directory
99 my $gsdlhome = $ENV{'GSDLHOME'};
100 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
101 }
102
103 if (! -e $xslt_filename) {
104 # Look in collection directory
105 my $coldir = $ENV{'COLLECTDIR'};
106 $xslt_filename = &util::filename_cat($coldir,$xslt);
107 }
108
109 if (! -e $xslt_filename) {
110 print $outhandle "Warning: Unable to find XSLT $xslt\n";
111 if (open(XMLIN,"<$filename")) {
112
113 my $untransformed_xml = "";
114 while (defined (my $line = <XMLIN>)) {
115
116 $untransformed_xml .= $line;
117 }
118 close(XMLIN);
119
120 return $untransformed_xml;
121 }
122 else {
123 print $outhandle "Error: Unable to open file $filename\n";
124 print $outhandle " $!\n";
125 return "";
126 }
127
128 }
129
130 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
131 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
132 my $xslt_base_cmd = "java -jar $jar_filename";
133 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
134
135 my $transformed_xml = "";
136
137 if (open(XSLT_IN,"$xslt_cmd |")) {
138 while (defined (my $line = <XSLT_IN>)) {
139
140 $transformed_xml .= $line;
141 }
142 close(XSLT_IN);
143 }
144 else {
145 print $outhandle "Error: Unable to run command $xslt_cmd\n";
146 print $outhandle " $!\n";
147 }
148
149 return $transformed_xml;
150
151}
152
153
154sub read {
155 # $self must be global to work with XML callback routines.
156 $self = shift (@_);
157
158 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
159
160 # Make sure we're processing the correct file, do blocking etc
161 my ($block_status,$filename) = $self->read_block(@_);
162 return $block_status if ((!defined $block_status) || ($block_status==0));
163
164 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
165 $self->{'file'} = $file;
166 $self->{'filename'} = $filename;
167 $self->{'processor'} = $processor;
168 $self->{'metadata'} = $metadata;
169
170 eval {
171 my $xslt = $self->{'xslt'};
172 if (defined $xslt && ($xslt ne "")) {
173 # perform xslt
174 my $transformed_xml = $self->apply_xslt($xslt,$filename);
175
176 # feed transformed file (now in memory as string) into XML parser
177 $self->{'parser'}->parse($transformed_xml);
178 }
179 else {
180 $self->{'parser'}->parsefile($filename);
181 }
182 };
183
184 if ($@) {
185
186 # parsefile may either croak somewhere in XML::Parser (e.g. because
187 # the document is not well formed) or die somewhere in XMLPlug or a
188 # derived plugin (e.g. because we're attempting to process a
189 # document whose DOCTYPE is not meant for this plugin). For the
190 # first case we'll print a warning and continue, for the second
191 # we'll just continue quietly
192
193 print STDERR "**** Error is: $@\n";
194
195 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
196 if (defined $msg) {
197 my $outhandle = $self->{'outhandle'};
198 my $plugin_name = ref ($self);
199 print $outhandle "$plugin_name failed to process $file ($msg)\n";
200 }
201
202 # reset ourself for the next document
203 $self->{'section_level'}=0;
204 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
205 return -1; # error during processing
206 }
207
208 return 1; # processed the file
209}
210
211# the following two methods are for if you want to do the parsing from a
212# plugin that inherits from this. it seems that you can't call the parse
213# methods directly. WHY???
214sub parse_file {
215 $self = shift (@_);
216 my ($filename) = @_;
217 $self->{'parser'}->parsefile($filename);
218}
219
220sub parse_string {
221 $self = shift (@_);
222 my ($xml_string) = @_;
223 $self->{'parser'}->parse($xml_string);
224}
225
226sub get_default_process_exp {
227 my $self = shift (@_);
228
229 return q^(?i)\.xml$^;
230}
231
232sub StartDocument {$self->xml_start_document(@_);}
233sub XMLDecl {$self->xml_xmldecl(@_);}
234sub Entity {$self->xml_entity(@_);}
235sub Doctype {$self->xml_doctype(@_);}
236sub StartTag {$self->xml_start_tag(@_);}
237sub EndTag {$self->xml_end_tag(@_);}
238sub Text {$self->xml_text(@_);}
239sub PI {$self->xml_pi(@_);}
240sub EndDocument {$self->xml_end_document(@_);}
241sub Default {$self->xml_default(@_);}
242
243# This Char function overrides the one in XML::Parser::Stream to overcome a
244# problem where $expat->{Text} is treated as the return value, slowing
245# things down significantly in some cases.
246sub Char {
247 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
248 $_[0]->{'Text'} .= $_[1];
249 return undef;
250}
251
252# Called at the beginning of the XML document.
253sub xml_start_document {
254 my $self = shift(@_);
255 my ($expat) = @_;
256
257 $self->open_document();
258}
259
260# Called for XML declarations
261sub xml_xmldecl {
262 my $self = shift(@_);
263 my ($expat, $version, $encoding, $standalone) = @_;
264}
265
266# Called for XML entities
267sub xml_entity {
268 my $self = shift(@_);
269 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
270}
271
272# Called for DOCTYPE declarations - use die to bail out if this doctype
273# is not meant for this plugin
274sub xml_doctype {
275 my $self = shift(@_);
276 my ($expat, $name, $sysid, $pubid, $internal) = @_;
277 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
278}
279
280# Called for every start tag. The $_ variable will contain a copy of the
281# tag and the %_ variable will contain the element's attributes.
282sub xml_start_tag {
283 my $self = shift(@_);
284 my ($expat, $element) = @_;
285}
286
287# Called for every end tag. The $_ variable will contain a copy of the tag.
288sub xml_end_tag {
289 my $self = shift(@_);
290 my ($expat, $element) = @_;
291}
292
293# Called just before start or end tags with accumulated non-markup text in
294# the $_ variable.
295sub xml_text {
296 my $self = shift(@_);
297 my ($expat) = @_;
298}
299
300# Called for processing instructions. The $_ variable will contain a copy
301# of the pi.
302sub xml_pi {
303 my $self = shift(@_);
304 my ($expat, $target, $data) = @_;
305}
306
307# Called at the end of the XML document.
308sub xml_end_document {
309 my $self = shift(@_);
310 my ($expat) = @_;
311
312 $self->close_document();
313}
314
315# Called for any characters not handled by the above functions.
316sub xml_default {
317 my $self = shift(@_);
318 my ($expat, $text) = @_;
319}
320
321sub open_document {
322 my $self = shift(@_);
323
324 # create a new document
325 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
326 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
327}
328
329sub close_document {
330 my $self = shift(@_);
331 my $doc_obj = $self->{'doc_obj'};
332 # include any metadata passed in from previous plugins
333 # note that this metadata is associated with the top level section
334 $self->extra_metadata ($doc_obj,
335 $doc_obj->get_top_section(),
336 $self->{'metadata'});
337
338 # do any automatic metadata extraction
339 $self->auto_extract_metadata ($doc_obj);
340
341 # add an OID
342 $doc_obj->set_OID();
343
344 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
345 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
346
347 # process the document
348 $self->{'processor'}->process($doc_obj);
349
350 $self->{'num_processed'} ++;
351}
352
3531;
354
355
356
357
Note: See TracBrowser for help on using the repository browser.