source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30#$%^
31use parse2;
32
33sub BEGIN {
34 @XMLPlug::ISA = ('BasPlug');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{XMLPlug.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "XMLPlug",
53 'desc' => "{XMLPlug.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58
59our ($self);
60sub new {
61 my ($class) = shift (@_);
62 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63 push(@$pluginlist, $class);
64
65 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67
68 # $self is global for use within subroutines called by XML::Parser
69 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
70
71 my $parser = new XML::Parser('Style' => 'Stream',
72 'Handlers' => {'Char' => \&Char,
73 'XMLDecl' => \&XMLDecl,
74 'Entity' => \&Entity,
75 'Doctype' => \&Doctype,
76 'Default' => \&Default
77 });
78 $self->{'parser'} = $parser;
79
80 return bless $self, $class;
81}
82
83sub apply_xslt
84{
85 my $self = shift @_;
86 my ($xslt,$filename) = @_;
87
88 my $outhandle = $self->{'outhandle'};
89
90 my $xslt_filename = $xslt;
91
92 if (! -e $xslt_filename) {
93 # Look in main site directory
94 my $gsdlhome = $ENV{'GSDLHOME'};
95 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
96 }
97
98 if (! -e $xslt_filename) {
99 # Look in collection directory
100 my $coldir = $ENV{'COLLECTDIR'};
101 $xslt_filename = &util::filename_cat($coldir,$xslt);
102 }
103
104 if (! -e $xslt_filename) {
105 print $outhandle "Warning: Unable to find XSLT $xslt\n";
106 if (open(XMLIN,"<$filename")) {
107
108 $untransformed_xml = "";
109 while (defined (my $line = <XMLIN>)) {
110
111 $untransformed_xml .= $line;
112 }
113 close(XMLIN);
114
115 return $untransformed_xml;
116 }
117 else {
118 print $outhandle "Error: Unable to open file $filename\n";
119 print $outhandle " $!\n";
120 return "";
121 }
122
123 }
124
125 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
126 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
127 my $xslt_base_cmd = "java -jar $jar_filename";
128 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
129
130 my $transformed_xml = "";
131
132 if (open(XSLT_IN,"$xslt_cmd |")) {
133 while (defined (my $line = <XSLT_IN>)) {
134
135 $transformed_xml .= $line;
136 }
137 close(XSLT_IN);
138 }
139 else {
140 print $outhandle "Error: Unable to run command $xsl_cmd\n";
141 print $outhandle " $!\n";
142 }
143
144 return $transformed_xml;
145
146}
147
148
149sub read {
150 # $self must be global to work with XML callback routines.
151 $self = shift (@_);
152
153 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
154
155 my $filename = $file;
156 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
157
158 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
159 $self->{'num_blocked'} ++;
160 return 0;
161 }
162 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
163 return undef;
164 }
165
166 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
167 $self->{'file'} = $file;
168 $self->{'filename'} = $filename;
169 $self->{'processor'} = $processor;
170 $self->{'metadata'} = $metadata;
171 $self->{'gli'} = $gli;
172
173 eval {
174 my $xslt = $self->{'xslt'};
175 if (defined $xslt && ($xslt ne "")) {
176 # perform xslt
177 my $transformed_xml = $self->apply_xslt($xslt,$filename);
178
179 # feed transformed file (now in memory as string) into XML parser
180 $self->{'parser'}->parse($transformed_xml);
181 }
182 else {
183 $self->{'parser'}->parsefile($filename);
184 }
185 };
186
187 if ($@) {
188
189 # parsefile may either croak somewhere in XML::Parser (e.g. because
190 # the document is not well formed) or die somewhere in XMLPlug or a
191 # derived plugin (e.g. because we're attempting to process a
192 # document whose DOCTYPE is not meant for this plugin). For the
193 # first case we'll print a warning and continue, for the second
194 # we'll just continue quietly
195
196 print STDERR "**** Error is: $@\n";
197
198 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
199 if (defined $msg) {
200 my $outhandle = $self->{'outhandle'};
201 my $plugin_name = ref ($self);
202 print $outhandle "$plugin_name failed to process $file ($msg)\n";
203 }
204
205 # reset ourself for the next document
206 $self->{'section_level'}=0;
207 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
208 return -1; # error during processing
209 }
210
211 return 1; # processed the file
212}
213
214# the following two methods are for if you want to do the parsing from a
215# plugin that inherits from this. it seems that you can't call the parse
216# methods directly. WHY???
217sub parse_file {
218 $self = shift (@_);
219 my ($filename) = @_;
220 $self->{'parser'}->parsefile($filename);
221}
222
223sub parse_string {
224 $self = shift (@_);
225 my ($xml_string) = @_;
226 $self->{'parser'}->parse($xml_string);
227}
228
229sub get_default_process_exp {
230 my $self = shift (@_);
231
232 return q^(?i)\.xml$^;
233}
234
235sub StartDocument {$self->xml_start_document(@_);}
236sub XMLDecl {$self->xml_xmldecl(@_);}
237sub Entity {$self->xml_entity(@_);}
238sub Doctype {$self->xml_doctype(@_);}
239sub StartTag {$self->xml_start_tag(@_);}
240sub EndTag {$self->xml_end_tag(@_);}
241sub Text {$self->xml_text(@_);}
242sub PI {$self->xml_pi(@_);}
243sub EndDocument {$self->xml_end_document(@_);}
244sub Default {$self->xml_default(@_);}
245
246# This Char function overrides the one in XML::Parser::Stream to overcome a
247# problem where $expat->{Text} is treated as the return value, slowing
248# things down significantly in some cases.
249sub Char {
250 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
251 $_[0]->{'Text'} .= $_[1];
252 return undef;
253}
254
255# Called at the beginning of the XML document.
256sub xml_start_document {
257 my $self = shift(@_);
258 my ($expat) = @_;
259
260 $self->open_document();
261}
262
263# Called for XML declarations
264sub xml_xmldecl {
265 my $self = shift(@_);
266 my ($expat, $version, $encoding, $standalone) = @_;
267}
268
269# Called for XML entities
270sub xml_entity {
271 my $self = shift(@_);
272 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
273}
274
275# Called for DOCTYPE declarations - use die to bail out if this doctype
276# is not meant for this plugin
277sub xml_doctype {
278 my $self = shift(@_);
279 my ($expat, $name, $sysid, $pubid, $internal) = @_;
280 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
281}
282
283# Called for every start tag. The $_ variable will contain a copy of the
284# tag and the %_ variable will contain the element's attributes.
285sub xml_start_tag {
286 my $self = shift(@_);
287 my ($expat, $element) = @_;
288}
289
290# Called for every end tag. The $_ variable will contain a copy of the tag.
291sub xml_end_tag {
292 my $self = shift(@_);
293 my ($expat, $element) = @_;
294}
295
296# Called just before start or end tags with accumulated non-markup text in
297# the $_ variable.
298sub xml_text {
299 my $self = shift(@_);
300 my ($expat) = @_;
301}
302
303# Called for processing instructions. The $_ variable will contain a copy
304# of the pi.
305sub xml_pi {
306 my $self = shift(@_);
307 my ($expat, $target, $data) = @_;
308}
309
310# Called at the end of the XML document.
311sub xml_end_document {
312 my $self = shift(@_);
313 my ($expat) = @_;
314
315 $self->close_document();
316}
317
318# Called for any characters not handled by the above functions.
319sub xml_default {
320 my $self = shift(@_);
321 my ($expat, $text) = @_;
322}
323
324sub open_document {
325 my $self = shift(@_);
326
327 # create a new document
328 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
329 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'});
330}
331
332sub close_document {
333 my $self = shift(@_);
334 my $doc_obj = $self->{'doc_obj'};
335 # include any metadata passed in from previous plugins
336 # note that this metadata is associated with the top level section
337 $self->extra_metadata ($doc_obj,
338 $doc_obj->get_top_section(),
339 $self->{'metadata'});
340
341 # do any automatic metadata extraction
342 $self->auto_extract_metadata ($doc_obj);
343
344 # add an OID
345 $doc_obj->set_OID();
346
347 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
348 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
349
350 # process the document
351 $self->{'processor'}->process($doc_obj);
352
353 $self->{'num_processed'} ++;
354}
355
3561;
357
358
359
360
Note: See TracBrowser for help on using the repository browser.