source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 11090

Last change on this file since 11090 was 11090, checked in by kjdon, 18 years ago

made all plugins that implement read() call read_block to check process_exp, block_exp, smart blocking, cover image blocking etc

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @XMLPlug::ISA = ('BasPlug');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{XMLPlug.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "XMLPlug",
53 'desc' => "{XMLPlug.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58
59our ($self);
60sub new {
61 my ($class) = shift (@_);
62 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63 push(@$pluginlist, $class);
64
65 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67
68 # $self is global for use within subroutines called by XML::Parser
69 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
70
71 if ($self->{'info_only'}) {
72 # don't worry about any options etc
73 return bless $self, $class;
74 }
75
76 my $parser = new XML::Parser('Style' => 'Stream',
77 'Handlers' => {'Char' => \&Char,
78 'XMLDecl' => \&XMLDecl,
79 'Entity' => \&Entity,
80 'Doctype' => \&Doctype,
81 'Default' => \&Default
82 });
83 $self->{'parser'} = $parser;
84
85 return bless $self, $class;
86}
87
88sub apply_xslt
89{
90 my $self = shift @_;
91 my ($xslt,$filename) = @_;
92
93 my $outhandle = $self->{'outhandle'};
94
95 my $xslt_filename = $xslt;
96
97 if (! -e $xslt_filename) {
98 # Look in main site directory
99 my $gsdlhome = $ENV{'GSDLHOME'};
100 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
101 }
102
103 if (! -e $xslt_filename) {
104 # Look in collection directory
105 my $coldir = $ENV{'COLLECTDIR'};
106 $xslt_filename = &util::filename_cat($coldir,$xslt);
107 }
108
109 if (! -e $xslt_filename) {
110 print $outhandle "Warning: Unable to find XSLT $xslt\n";
111 if (open(XMLIN,"<$filename")) {
112
113 my $untransformed_xml = "";
114 while (defined (my $line = <XMLIN>)) {
115
116 $untransformed_xml .= $line;
117 }
118 close(XMLIN);
119
120 return $untransformed_xml;
121 }
122 else {
123 print $outhandle "Error: Unable to open file $filename\n";
124 print $outhandle " $!\n";
125 return "";
126 }
127
128 }
129
130 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
131 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
132 my $xslt_base_cmd = "java -jar $jar_filename";
133 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
134
135 my $transformed_xml = "";
136
137 if (open(XSLT_IN,"$xslt_cmd |")) {
138 while (defined (my $line = <XSLT_IN>)) {
139
140 $transformed_xml .= $line;
141 }
142 close(XSLT_IN);
143 }
144 else {
145 print $outhandle "Error: Unable to run command $xslt_cmd\n";
146 print $outhandle " $!\n";
147 }
148
149 return $transformed_xml;
150
151}
152
153
154sub read {
155 # $self must be global to work with XML callback routines.
156 $self = shift (@_);
157
158 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
159
160 # Make sure we're processing the correct file, do blocking etc
161 my ($block_status,$filename) = $self->read_block(@_);
162 return $block_status if ((!defined $block_status) || ($block_status==0));
163
164 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
165 $self->{'file'} = $file;
166 $self->{'filename'} = $filename;
167 $self->{'processor'} = $processor;
168 $self->{'metadata'} = $metadata;
169 $self->{'gli'} = $gli;
170
171 eval {
172 my $xslt = $self->{'xslt'};
173 if (defined $xslt && ($xslt ne "")) {
174 # perform xslt
175 my $transformed_xml = $self->apply_xslt($xslt,$filename);
176
177 # feed transformed file (now in memory as string) into XML parser
178 $self->{'parser'}->parse($transformed_xml);
179 }
180 else {
181 $self->{'parser'}->parsefile($filename);
182 }
183 };
184
185 if ($@) {
186
187 # parsefile may either croak somewhere in XML::Parser (e.g. because
188 # the document is not well formed) or die somewhere in XMLPlug or a
189 # derived plugin (e.g. because we're attempting to process a
190 # document whose DOCTYPE is not meant for this plugin). For the
191 # first case we'll print a warning and continue, for the second
192 # we'll just continue quietly
193
194 print STDERR "**** Error is: $@\n";
195
196 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
197 if (defined $msg) {
198 my $outhandle = $self->{'outhandle'};
199 my $plugin_name = ref ($self);
200 print $outhandle "$plugin_name failed to process $file ($msg)\n";
201 }
202
203 # reset ourself for the next document
204 $self->{'section_level'}=0;
205 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
206 return -1; # error during processing
207 }
208
209 return 1; # processed the file
210}
211
212# the following two methods are for if you want to do the parsing from a
213# plugin that inherits from this. it seems that you can't call the parse
214# methods directly. WHY???
215sub parse_file {
216 $self = shift (@_);
217 my ($filename) = @_;
218 $self->{'parser'}->parsefile($filename);
219}
220
221sub parse_string {
222 $self = shift (@_);
223 my ($xml_string) = @_;
224 $self->{'parser'}->parse($xml_string);
225}
226
227sub get_default_process_exp {
228 my $self = shift (@_);
229
230 return q^(?i)\.xml$^;
231}
232
233sub StartDocument {$self->xml_start_document(@_);}
234sub XMLDecl {$self->xml_xmldecl(@_);}
235sub Entity {$self->xml_entity(@_);}
236sub Doctype {$self->xml_doctype(@_);}
237sub StartTag {$self->xml_start_tag(@_);}
238sub EndTag {$self->xml_end_tag(@_);}
239sub Text {$self->xml_text(@_);}
240sub PI {$self->xml_pi(@_);}
241sub EndDocument {$self->xml_end_document(@_);}
242sub Default {$self->xml_default(@_);}
243
244# This Char function overrides the one in XML::Parser::Stream to overcome a
245# problem where $expat->{Text} is treated as the return value, slowing
246# things down significantly in some cases.
247sub Char {
248 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
249 $_[0]->{'Text'} .= $_[1];
250 return undef;
251}
252
253# Called at the beginning of the XML document.
254sub xml_start_document {
255 my $self = shift(@_);
256 my ($expat) = @_;
257
258 $self->open_document();
259}
260
261# Called for XML declarations
262sub xml_xmldecl {
263 my $self = shift(@_);
264 my ($expat, $version, $encoding, $standalone) = @_;
265}
266
267# Called for XML entities
268sub xml_entity {
269 my $self = shift(@_);
270 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
271}
272
273# Called for DOCTYPE declarations - use die to bail out if this doctype
274# is not meant for this plugin
275sub xml_doctype {
276 my $self = shift(@_);
277 my ($expat, $name, $sysid, $pubid, $internal) = @_;
278 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
279}
280
281# Called for every start tag. The $_ variable will contain a copy of the
282# tag and the %_ variable will contain the element's attributes.
283sub xml_start_tag {
284 my $self = shift(@_);
285 my ($expat, $element) = @_;
286}
287
288# Called for every end tag. The $_ variable will contain a copy of the tag.
289sub xml_end_tag {
290 my $self = shift(@_);
291 my ($expat, $element) = @_;
292}
293
294# Called just before start or end tags with accumulated non-markup text in
295# the $_ variable.
296sub xml_text {
297 my $self = shift(@_);
298 my ($expat) = @_;
299}
300
301# Called for processing instructions. The $_ variable will contain a copy
302# of the pi.
303sub xml_pi {
304 my $self = shift(@_);
305 my ($expat, $target, $data) = @_;
306}
307
308# Called at the end of the XML document.
309sub xml_end_document {
310 my $self = shift(@_);
311 my ($expat) = @_;
312
313 $self->close_document();
314}
315
316# Called for any characters not handled by the above functions.
317sub xml_default {
318 my $self = shift(@_);
319 my ($expat, $text) = @_;
320}
321
322sub open_document {
323 my $self = shift(@_);
324
325 # create a new document
326 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
327 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
328}
329
330sub close_document {
331 my $self = shift(@_);
332 my $doc_obj = $self->{'doc_obj'};
333 # include any metadata passed in from previous plugins
334 # note that this metadata is associated with the top level section
335 $self->extra_metadata ($doc_obj,
336 $doc_obj->get_top_section(),
337 $self->{'metadata'});
338
339 # do any automatic metadata extraction
340 $self->auto_extract_metadata ($doc_obj);
341
342 # add an OID
343 $doc_obj->set_OID();
344
345 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
346 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
347
348 # process the document
349 $self->{'processor'}->process($doc_obj);
350
351 $self->{'num_processed'} ++;
352}
353
3541;
355
356
357
358
Note: See TracBrowser for help on using the repository browser.