source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 10170

Last change on this file since 10170 was 10170, checked in by kjdon, 19 years ago

made our, added two parse methods - if you want to do xml parsing from a subclass, can't call the parser directly

  • Property svn:keywords set to Author Date Id Revision
File size: 9.7 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30
31sub BEGIN {
32 @XMLPlug::ISA = ('BasPlug');
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
34}
35
36use XMLParser;
37
38my $arguments =
39 [ { 'name' => "process_exp",
40 'desc' => "{BasPlug.process_exp}",
41 'type' => "regexp",
42 'deft' => &get_default_process_exp(),
43 'reqd' => "no" },
44 { 'name' => "xslt",
45 'desc' => "{XMLPlug.xslt}",
46 'type' => "string",
47 'deft' => "",
48 'reqd' => "no" } ];
49
50my $options = { 'name' => "XMLPlug",
51 'desc' => "{XMLPlug.desc}",
52 'abstract' => "yes",
53 'inherits' => "yes",
54 'args' => $arguments };
55
56
57our ($self);
58sub new {
59 my $class = shift (@_);
60
61 # $self is global for use within subroutines called by XML::Parser
62 $self = new BasPlug ($class, @_);
63 $self->{'plugin_type'} = "XMLPlug";
64 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
65 my $option_list = $self->{'option_list'};
66 push( @{$option_list}, $options );
67
68 if (!parsargv::parse(\@_,
69 q^xslt/.*/^, \$self->{'xslt'},
70 "allow_extra_options")) {
71
72 print STDERR "\nIncorrect options passed to XSLTPlug, check your collect.cfg configuration file\n";
73 $self->print_txt_usage(""); # Use default resource bundle
74 die "\n";
75 }
76
77
78 my $parser = new XML::Parser('Style' => 'Stream',
79 'Handlers' => {'Char' => \&Char,
80 'XMLDecl' => \&XMLDecl,
81 'Entity' => \&Entity,
82 'Doctype' => \&Doctype,
83 'Default' => \&Default
84 });
85
86
87
88 $self->{'parser'} = $parser;
89
90 return bless $self, $class;
91}
92
93sub apply_xslt
94{
95 my $self = shift @_;
96 my ($xslt,$filename) = @_;
97
98 my $outhandle = $self->{'outhandle'};
99
100 my $xslt_filename = $xslt;
101
102 if (! -e $xslt_filename) {
103 # Look in main site directory
104 my $gsdlhome = $ENV{'GSDLHOME'};
105 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
106 }
107
108 if (! -e $xslt_filename) {
109 # Look in collection directory
110 my $coldir = $ENV{'COLLECTDIR'};
111 $xslt_filename = &util::filename_cat($coldir,$xslt);
112 }
113
114 if (! -e $xslt_filename) {
115 print $outhandle "Warning: Unable to find XSLT $xslt\n";
116 if (open(XMLIN,"<$filename")) {
117
118 $untransformed_xml = "";
119 while (defined (my $line = <XMLIN>)) {
120
121 $untransformed_xml .= $line;
122 }
123 close(XMLIN);
124
125 return $untransformed_xml;
126 }
127 else {
128 print $outhandle "Error: Unable to open file $filename\n";
129 print $outhandle " $!\n";
130 return "";
131 }
132
133 }
134
135 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
136 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
137 my $xslt_base_cmd = "java -jar $jar_filename";
138 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
139
140 my $transformed_xml = "";
141
142 if (open(XSLT_IN,"$xslt_cmd |")) {
143 while (defined (my $line = <XSLT_IN>)) {
144
145 $transformed_xml .= $line;
146 }
147 close(XSLT_IN);
148 }
149 else {
150 print $outhandle "Error: Unable to run command $xsl_cmd\n";
151 print $outhandle " $!\n";
152 }
153
154 return $transformed_xml;
155
156}
157
158
159sub read {
160 # $self must be global to work with XML callback routines.
161 $self = shift (@_);
162
163 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
164
165 my $filename = $file;
166 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
167
168 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
169 $self->{'num_blocked'} ++;
170 return 0;
171 }
172 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
173 return undef;
174 }
175
176 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
177 $self->{'file'} = $file;
178 $self->{'filename'} = $filename;
179 $self->{'processor'} = $processor;
180 $self->{'metadata'} = $metadata;
181 $self->{'gli'} = $gli;
182
183 eval {
184 my $xslt = $self->{'xslt'};
185 if (defined $xslt && ($xslt ne "")) {
186 # perform xslt
187 my $transformed_xml = $self->apply_xslt($xslt,$filename);
188
189 # feed transformed file (now in memory as string) into XML parser
190 $self->{'parser'}->parse($transformed_xml);
191 }
192 else {
193 $self->{'parser'}->parsefile($filename);
194 }
195 };
196
197 if ($@) {
198
199 # parsefile may either croak somewhere in XML::Parser (e.g. because
200 # the document is not well formed) or die somewhere in XMLPlug or a
201 # derived plugin (e.g. because we're attempting to process a
202 # document whose DOCTYPE is not meant for this plugin). For the
203 # first case we'll print a warning and continue, for the second
204 # we'll just continue quietly
205
206 print STDERR "**** Error is: $@\n";
207
208 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
209 if (defined $msg) {
210 my $outhandle = $self->{'outhandle'};
211 my $plugin_name = ref ($self);
212 print $outhandle "$plugin_name failed to process $file ($msg)\n";
213 }
214
215 # reset ourself for the next document
216 $self->{'section_level'}=0;
217 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
218 return -1; # error during processing
219 }
220
221 return 1; # processed the file
222}
223
224# the following two methods are for if you want to do the parsing from a
225# plugin that inherits from this. it seems that you can't call the parse
226# methods directly. WHY???
227sub parse_file {
228 $self = shift (@_);
229 my ($filename) = @_;
230 $self->{'parser'}->parsefile($filename);
231}
232
233sub parse_string {
234 $self = shift (@_);
235 my ($xml_string) = @_;
236 $self->{'parser'}->parse($xml_string);
237}
238
239sub get_default_process_exp {
240 my $self = shift (@_);
241
242 return q^(?i)\.xml$^;
243}
244
245sub StartDocument {$self->xml_start_document(@_);}
246sub XMLDecl {$self->xml_xmldecl(@_);}
247sub Entity {$self->xml_entity(@_);}
248sub Doctype {$self->xml_doctype(@_);}
249sub StartTag {$self->xml_start_tag(@_);}
250sub EndTag {$self->xml_end_tag(@_);}
251sub Text {$self->xml_text(@_);}
252sub PI {$self->xml_pi(@_);}
253sub EndDocument {$self->xml_end_document(@_);}
254sub Default {$self->xml_default(@_);}
255
256# This Char function overrides the one in XML::Parser::Stream to overcome a
257# problem where $expat->{Text} is treated as the return value, slowing
258# things down significantly in some cases.
259sub Char {
260 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
261 $_[0]->{'Text'} .= $_[1];
262 return undef;
263}
264
265# Called at the beginning of the XML document.
266sub xml_start_document {
267 my $self = shift(@_);
268 my ($expat) = @_;
269
270 $self->open_document();
271}
272
273# Called for XML declarations
274sub xml_xmldecl {
275 my $self = shift(@_);
276 my ($expat, $version, $encoding, $standalone) = @_;
277}
278
279# Called for XML entities
280sub xml_entity {
281 my $self = shift(@_);
282 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
283}
284
285# Called for DOCTYPE declarations - use die to bail out if this doctype
286# is not meant for this plugin
287sub xml_doctype {
288 my $self = shift(@_);
289 my ($expat, $name, $sysid, $pubid, $internal) = @_;
290 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
291}
292
293# Called for every start tag. The $_ variable will contain a copy of the
294# tag and the %_ variable will contain the element's attributes.
295sub xml_start_tag {
296 my $self = shift(@_);
297 my ($expat, $element) = @_;
298}
299
300# Called for every end tag. The $_ variable will contain a copy of the tag.
301sub xml_end_tag {
302 my $self = shift(@_);
303 my ($expat, $element) = @_;
304}
305
306# Called just before start or end tags with accumulated non-markup text in
307# the $_ variable.
308sub xml_text {
309 my $self = shift(@_);
310 my ($expat) = @_;
311}
312
313# Called for processing instructions. The $_ variable will contain a copy
314# of the pi.
315sub xml_pi {
316 my $self = shift(@_);
317 my ($expat, $target, $data) = @_;
318}
319
320# Called at the end of the XML document.
321sub xml_end_document {
322 my $self = shift(@_);
323 my ($expat) = @_;
324
325 $self->close_document();
326}
327
328# Called for any characters not handled by the above functions.
329sub xml_default {
330 my $self = shift(@_);
331 my ($expat, $text) = @_;
332}
333
334sub open_document {
335 my $self = shift(@_);
336
337 # create a new document
338 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
339 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
340}
341
342sub close_document {
343 my $self = shift(@_);
344 my $doc_obj = $self->{'doc_obj'};
345 # include any metadata passed in from previous plugins
346 # note that this metadata is associated with the top level section
347 $self->extra_metadata ($doc_obj,
348 $doc_obj->get_top_section(),
349 $self->{'metadata'});
350
351 # do any automatic metadata extraction
352 $self->auto_extract_metadata ($doc_obj);
353
354 # add an OID
355 $doc_obj->set_OID();
356
357 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
358 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
359
360 # process the document
361 $self->{'processor'}->process($doc_obj);
362
363 $self->{'num_processed'} ++;
364}
365
3661;
367
368
369
370
Note: See TracBrowser for help on using the repository browser.