source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 12969

Last change on this file since 12969 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @XMLPlug::ISA = ('BasPlug');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{XMLPlug.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "XMLPlug",
53 'desc' => "{XMLPlug.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58
59our ($self);
60sub new {
61 my ($class) = shift (@_);
62 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63 push(@$pluginlist, $class);
64
65 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67
68 # $self is global for use within subroutines called by XML::Parser
69 $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
70
71 if ($self->{'info_only'}) {
72 # don't worry about any options etc
73 return bless $self, $class;
74 }
75
76 my $parser = new XML::Parser('Style' => 'Stream',
77 'Handlers' => {'Char' => \&Char,
78 'XMLDecl' => \&XMLDecl,
79 'Entity' => \&Entity,
80 'Doctype' => \&Doctype,
81 'Default' => \&Default
82 });
83 $self->{'parser'} = $parser;
84
85 return bless $self, $class;
86}
87
88sub apply_xslt
89{
90 my $self = shift @_;
91 my ($xslt,$filename) = @_;
92
93 my $outhandle = $self->{'outhandle'};
94
95 my $xslt_filename = $xslt;
96
97 if (! -e $xslt_filename) {
98 # Look in main site directory
99 my $gsdlhome = $ENV{'GSDLHOME'};
100 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
101 }
102
103 if (! -e $xslt_filename) {
104 # Look in collection directory
105 my $coldir = $ENV{'GSDLCOLLECTDIR'};
106 $xslt_filename = &util::filename_cat($coldir,$xslt);
107 }
108
109 if (! -e $xslt_filename) {
110 print $outhandle "Warning: Unable to find XSLT $xslt\n";
111 if (open(XMLIN,"<$filename")) {
112
113 my $untransformed_xml = "";
114 while (defined (my $line = <XMLIN>)) {
115
116 $untransformed_xml .= $line;
117 }
118 close(XMLIN);
119
120 return $untransformed_xml;
121 }
122 else {
123 print $outhandle "Error: Unable to open file $filename\n";
124 print $outhandle " $!\n";
125 return "";
126 }
127
128 }
129
130 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
131 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
132 my $xslt_base_cmd = "java -jar $jar_filename";
133 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
134
135 my $transformed_xml = "";
136
137 if (open(XSLT_IN,"$xslt_cmd |")) {
138 while (defined (my $line = <XSLT_IN>)) {
139
140 $transformed_xml .= $line;
141 }
142 close(XSLT_IN);
143 }
144 else {
145 print $outhandle "Error: Unable to run command $xslt_cmd\n";
146 print $outhandle " $!\n";
147 }
148
149 return $transformed_xml;
150
151}
152
153
154sub read {
155 # $self must be global to work with XML callback routines.
156 $self = shift (@_);
157
158 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
159
160 # Make sure we're processing the correct file, do blocking etc
161 my ($block_status,$filename) = $self->read_block(@_);
162 return $block_status if ((!defined $block_status) || ($block_status==0));
163
164 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
165 $self->{'base_dir'} = $base_dir;
166 $self->{'file'} = $file;
167 $self->{'filename'} = $filename;
168 $self->{'processor'} = $processor;
169 $self->{'metadata'} = $metadata;
170
171 eval {
172 my $xslt = $self->{'xslt'};
173 if (defined $xslt && ($xslt ne "")) {
174 # perform xslt
175 my $transformed_xml = $self->apply_xslt($xslt,$filename);
176
177 # feed transformed file (now in memory as string) into XML parser
178 $self->{'parser'}->parse($transformed_xml);
179 }
180 else {
181 $self->{'parser'}->parsefile($filename);
182 }
183 };
184
185 if ($@) {
186
187 # parsefile may either croak somewhere in XML::Parser (e.g. because
188 # the document is not well formed) or die somewhere in XMLPlug or a
189 # derived plugin (e.g. because we're attempting to process a
190 # document whose DOCTYPE is not meant for this plugin). For the
191 # first case we'll print a warning and continue, for the second
192 # we'll just continue quietly
193
194 print STDERR "**** Error is: $@\n";
195
196 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
197 if (defined $msg) {
198 my $outhandle = $self->{'outhandle'};
199 my $plugin_name = ref ($self);
200 print $outhandle "$plugin_name failed to process $file ($msg)\n";
201 }
202
203 # reset ourself for the next document
204 $self->{'section_level'}=0;
205 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
206 return -1; # error during processing
207 }
208
209 return 1; # processed the file
210}
211
212# the following two methods are for if you want to do the parsing from a
213# plugin that inherits from this. it seems that you can't call the parse
214# methods directly. WHY???
215sub parse_file {
216 $self = shift (@_);
217 my ($filename) = @_;
218 $self->{'parser'}->parsefile($filename);
219}
220
221sub parse_string {
222 $self = shift (@_);
223 my ($xml_string) = @_;
224 $self->{'parser'}->parse($xml_string);
225}
226
227sub get_default_process_exp {
228 my $self = shift (@_);
229
230 return q^(?i)\.xml$^;
231}
232
233sub StartDocument {$self->xml_start_document(@_);}
234sub XMLDecl {$self->xml_xmldecl(@_);}
235sub Entity {$self->xml_entity(@_);}
236sub Doctype {$self->xml_doctype(@_);}
237sub StartTag {$self->xml_start_tag(@_);}
238sub EndTag {$self->xml_end_tag(@_);}
239sub Text {$self->xml_text(@_);}
240sub PI {$self->xml_pi(@_);}
241sub EndDocument {$self->xml_end_document(@_);}
242sub Default {$self->xml_default(@_);}
243
244# This Char function overrides the one in XML::Parser::Stream to overcome a
245# problem where $expat->{Text} is treated as the return value, slowing
246# things down significantly in some cases.
247sub Char {
248 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
249 $_[0]->{'Text'} .= $_[1];
250 return undef;
251}
252
253# Called at the beginning of the XML document.
254sub xml_start_document {
255 my $self = shift(@_);
256 my ($expat) = @_;
257
258 $self->open_document();
259}
260
261# Called for XML declarations
262sub xml_xmldecl {
263 my $self = shift(@_);
264 my ($expat, $version, $encoding, $standalone) = @_;
265}
266
267# Called for XML entities
268sub xml_entity {
269 my $self = shift(@_);
270 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
271}
272
273# Called for DOCTYPE declarations - use die to bail out if this doctype
274# is not meant for this plugin
275sub xml_doctype {
276 my $self = shift(@_);
277 my ($expat, $name, $sysid, $pubid, $internal) = @_;
278 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
279}
280
281# Called for every start tag. The $_ variable will contain a copy of the
282# tag and the %_ variable will contain the element's attributes.
283sub xml_start_tag {
284 my $self = shift(@_);
285 my ($expat, $element) = @_;
286}
287
288# Called for every end tag. The $_ variable will contain a copy of the tag.
289sub xml_end_tag {
290 my $self = shift(@_);
291 my ($expat, $element) = @_;
292}
293
294# Called just before start or end tags with accumulated non-markup text in
295# the $_ variable.
296sub xml_text {
297 my $self = shift(@_);
298 my ($expat) = @_;
299}
300
301# Called for processing instructions. The $_ variable will contain a copy
302# of the pi.
303sub xml_pi {
304 my $self = shift(@_);
305 my ($expat, $target, $data) = @_;
306}
307
308# Called at the end of the XML document.
309sub xml_end_document {
310 my $self = shift(@_);
311 my ($expat) = @_;
312
313 $self->close_document();
314}
315
316# Called for any characters not handled by the above functions.
317sub xml_default {
318 my $self = shift(@_);
319 my ($expat, $text) = @_;
320}
321
322sub open_document {
323 my $self = shift(@_);
324
325 # create a new document
326 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
327 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
328}
329
330sub close_document {
331 my $self = shift(@_);
332 my $doc_obj = $self->{'doc_obj'};
333 # include any metadata passed in from previous plugins
334 # note that this metadata is associated with the top level section
335 $self->extra_metadata ($doc_obj,
336 $doc_obj->get_top_section(),
337 $self->{'metadata'});
338
339 # do any automatic metadata extraction
340 $self->auto_extract_metadata ($doc_obj);
341
342 # add an OID
343 $doc_obj->set_OID();
344
345 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
346 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
347
348 # process the document
349 $self->{'processor'}->process($doc_obj);
350
351 $self->{'num_processed'} ++;
352}
353
3541;
355
356
357
358
Note: See TracBrowser for help on using the repository browser.