source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @XMLPlug::ISA = ('BasPlug');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{XMLPlug.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "XMLPlug",
53 'desc' => "{XMLPlug.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58
59our ($self);
60sub new {
61 my ($class) = shift (@_);
62 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63 push(@$pluginlist, $class);
64
65 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67
68 # $self is global for use within subroutines called by XML::Parser
69 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
70
71 my $parser = new XML::Parser('Style' => 'Stream',
72 'Handlers' => {'Char' => \&Char,
73 'XMLDecl' => \&XMLDecl,
74 'Entity' => \&Entity,
75 'Doctype' => \&Doctype,
76 'Default' => \&Default
77 });
78 $self->{'parser'} = $parser;
79
80 return bless $self, $class;
81}
82
83sub apply_xslt
84{
85 my $self = shift @_;
86 my ($xslt,$filename) = @_;
87
88 my $outhandle = $self->{'outhandle'};
89
90 my $xslt_filename = $xslt;
91
92 if (! -e $xslt_filename) {
93 # Look in main site directory
94 my $gsdlhome = $ENV{'GSDLHOME'};
95 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
96 }
97
98 if (! -e $xslt_filename) {
99 # Look in collection directory
100 my $coldir = $ENV{'COLLECTDIR'};
101 $xslt_filename = &util::filename_cat($coldir,$xslt);
102 }
103
104 if (! -e $xslt_filename) {
105 print $outhandle "Warning: Unable to find XSLT $xslt\n";
106 if (open(XMLIN,"<$filename")) {
107
108 my $untransformed_xml = "";
109 while (defined (my $line = <XMLIN>)) {
110
111 $untransformed_xml .= $line;
112 }
113 close(XMLIN);
114
115 return $untransformed_xml;
116 }
117 else {
118 print $outhandle "Error: Unable to open file $filename\n";
119 print $outhandle " $!\n";
120 return "";
121 }
122
123 }
124
125 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
126 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
127 my $xslt_base_cmd = "java -jar $jar_filename";
128 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
129
130 my $transformed_xml = "";
131
132 if (open(XSLT_IN,"$xslt_cmd |")) {
133 while (defined (my $line = <XSLT_IN>)) {
134
135 $transformed_xml .= $line;
136 }
137 close(XSLT_IN);
138 }
139 else {
140 print $outhandle "Error: Unable to run command $xslt_cmd\n";
141 print $outhandle " $!\n";
142 }
143
144 return $transformed_xml;
145
146}
147
148
149sub read {
150 # $self must be global to work with XML callback routines.
151 $self = shift (@_);
152
153 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
154
155 my $filename = $file;
156 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
157
158 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
159 $self->{'num_blocked'} ++;
160 return 0;
161 }
162 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
163 return undef;
164 }
165
166 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
167 $self->{'file'} = $file;
168 $self->{'filename'} = $filename;
169 $self->{'processor'} = $processor;
170 $self->{'metadata'} = $metadata;
171 $self->{'gli'} = $gli;
172
173 eval {
174 my $xslt = $self->{'xslt'};
175 if (defined $xslt && ($xslt ne "")) {
176 # perform xslt
177 my $transformed_xml = $self->apply_xslt($xslt,$filename);
178
179 # feed transformed file (now in memory as string) into XML parser
180 $self->{'parser'}->parse($transformed_xml);
181 }
182 else {
183 $self->{'parser'}->parsefile($filename);
184 }
185 };
186
187 if ($@) {
188
189 # parsefile may either croak somewhere in XML::Parser (e.g. because
190 # the document is not well formed) or die somewhere in XMLPlug or a
191 # derived plugin (e.g. because we're attempting to process a
192 # document whose DOCTYPE is not meant for this plugin). For the
193 # first case we'll print a warning and continue, for the second
194 # we'll just continue quietly
195
196 print STDERR "**** Error is: $@\n";
197
198 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
199 if (defined $msg) {
200 my $outhandle = $self->{'outhandle'};
201 my $plugin_name = ref ($self);
202 print $outhandle "$plugin_name failed to process $file ($msg)\n";
203 }
204
205 # reset ourself for the next document
206 $self->{'section_level'}=0;
207 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
208 return -1; # error during processing
209 }
210
211 return 1; # processed the file
212}
213
214# the following two methods are for if you want to do the parsing from a
215# plugin that inherits from this. it seems that you can't call the parse
216# methods directly. WHY???
217sub parse_file {
218 $self = shift (@_);
219 my ($filename) = @_;
220 $self->{'parser'}->parsefile($filename);
221}
222
223sub parse_string {
224 $self = shift (@_);
225 my ($xml_string) = @_;
226 $self->{'parser'}->parse($xml_string);
227}
228
229sub get_default_process_exp {
230 my $self = shift (@_);
231
232 return q^(?i)\.xml$^;
233}
234
235sub StartDocument {$self->xml_start_document(@_);}
236sub XMLDecl {$self->xml_xmldecl(@_);}
237sub Entity {$self->xml_entity(@_);}
238sub Doctype {$self->xml_doctype(@_);}
239sub StartTag {$self->xml_start_tag(@_);}
240sub EndTag {$self->xml_end_tag(@_);}
241sub Text {$self->xml_text(@_);}
242sub PI {$self->xml_pi(@_);}
243sub EndDocument {$self->xml_end_document(@_);}
244sub Default {$self->xml_default(@_);}
245
246# This Char function overrides the one in XML::Parser::Stream to overcome a
247# problem where $expat->{Text} is treated as the return value, slowing
248# things down significantly in some cases.
249sub Char {
250 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
251 $_[0]->{'Text'} .= $_[1];
252 return undef;
253}
254
255# Called at the beginning of the XML document.
256sub xml_start_document {
257 my $self = shift(@_);
258 my ($expat) = @_;
259
260 $self->open_document();
261}
262
263# Called for XML declarations
264sub xml_xmldecl {
265 my $self = shift(@_);
266 my ($expat, $version, $encoding, $standalone) = @_;
267}
268
269# Called for XML entities
270sub xml_entity {
271 my $self = shift(@_);
272 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
273}
274
275# Called for DOCTYPE declarations - use die to bail out if this doctype
276# is not meant for this plugin
277sub xml_doctype {
278 my $self = shift(@_);
279 my ($expat, $name, $sysid, $pubid, $internal) = @_;
280 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
281}
282
283# Called for every start tag. The $_ variable will contain a copy of the
284# tag and the %_ variable will contain the element's attributes.
285sub xml_start_tag {
286 my $self = shift(@_);
287 my ($expat, $element) = @_;
288}
289
290# Called for every end tag. The $_ variable will contain a copy of the tag.
291sub xml_end_tag {
292 my $self = shift(@_);
293 my ($expat, $element) = @_;
294}
295
296# Called just before start or end tags with accumulated non-markup text in
297# the $_ variable.
298sub xml_text {
299 my $self = shift(@_);
300 my ($expat) = @_;
301}
302
303# Called for processing instructions. The $_ variable will contain a copy
304# of the pi.
305sub xml_pi {
306 my $self = shift(@_);
307 my ($expat, $target, $data) = @_;
308}
309
310# Called at the end of the XML document.
311sub xml_end_document {
312 my $self = shift(@_);
313 my ($expat) = @_;
314
315 $self->close_document();
316}
317
318# Called for any characters not handled by the above functions.
319sub xml_default {
320 my $self = shift(@_);
321 my ($expat, $text) = @_;
322}
323
324sub open_document {
325 my $self = shift(@_);
326
327 # create a new document
328 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
329 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
330}
331
332sub close_document {
333 my $self = shift(@_);
334 my $doc_obj = $self->{'doc_obj'};
335 # include any metadata passed in from previous plugins
336 # note that this metadata is associated with the top level section
337 $self->extra_metadata ($doc_obj,
338 $doc_obj->get_top_section(),
339 $self->{'metadata'});
340
341 # do any automatic metadata extraction
342 $self->auto_extract_metadata ($doc_obj);
343
344 # add an OID
345 $doc_obj->set_OID();
346
347 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
348 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
349
350 # process the document
351 $self->{'processor'}->process($doc_obj);
352
353 $self->{'num_processed'} ++;
354}
355
3561;
357
358
359
360
Note: See TracBrowser for help on using the repository browser.