root/trunk/gsdl/perllib/plugins/XMLPlug.pm @ 14103

Revision 14103, 11.8 KB (checked in by sjboddie, 12 years ago)

Fixed up the way XMLPlug is implemented, so it no longer relies on $self
being a global variable. This change should not make any functional
difference, but will allow inheritance to work properly.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34    @XMLPlug::ISA = ('BasPlug');
35    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41    [ { 'name' => "process_exp",
42    'desc' => "{BasPlug.process_exp}",
43    'type' => "regexp",
44    'deft' => &get_default_process_exp(),
45    'reqd' => "no" },
46      { 'name' => "xslt",
47    'desc' => "{XMLPlug.xslt}",
48    'type' => "string",
49    'deft' => "",
50    'reqd' => "no" } ];
51
52my $options = { 'name'     => "XMLPlug",
53        'desc'     => "{XMLPlug.desc}",
54        'abstract' => "yes",
55        'inherits' => "yes",
56        'args'     => $arguments };
57
58sub new {
59    my ($class) = shift (@_);
60    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61    push(@$pluginlist, $class);
62
63    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
64    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
65   
66    # $self is global for use within subroutines called by XML::Parser
67    my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
68
69    if ($self->{'info_only'}) {
70    # don't worry about any options etc
71    return bless $self, $class;
72    }
73
74    my $parser = new XML::Parser('Style' => 'Stream',
75                                 'Pkg' => 'XMLPlug',
76                                 'PluginObj' => $self,
77                 'Handlers' => {'Char' => \&Char,
78                        'XMLDecl' => \&XMLDecl,
79                        'Entity' => \&Entity,
80                        'Doctype' => \&Doctype,
81                        'Default' => \&Default,
82                                 });
83
84    $self->{'parser'} = $parser;
85
86    return bless $self, $class;
87}
88
89# the inheriting class must implement this method to tell whether to parse this doc type
90sub get_doctype {
91    my $self = shift(@_);
92    die "$self The inheriting class must implement get_doctype method";
93}
94
95
96sub apply_xslt
97{
98    my $self = shift @_;
99    my ($xslt,$filename) = @_;
100   
101    my $outhandle = $self->{'outhandle'};
102
103    my $xslt_filename = $xslt;
104
105    if (! -e $xslt_filename) {
106    # Look in main site directory
107    my $gsdlhome = $ENV{'GSDLHOME'};
108    $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
109    }
110
111    if (! -e $xslt_filename) {
112    # Look in collection directory
113    my $coldir = $ENV{'GSDLCOLLECTDIR'};
114    $xslt_filename = &util::filename_cat($coldir,$xslt);
115    }
116
117    if (! -e $xslt_filename) {
118    print $outhandle "Warning: Unable to find XSLT $xslt\n";
119    if (open(XMLIN,"<$filename")) {
120
121        my $untransformed_xml = "";
122        while (defined (my $line = <XMLIN>)) {
123
124        $untransformed_xml .= $line;
125        }
126        close(XMLIN);
127       
128        return $untransformed_xml;
129    }
130    else {
131        print $outhandle "Error: Unable to open file $filename\n";
132        print $outhandle "       $!\n";
133        return "";
134    }
135   
136    }
137
138    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
139    my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
140    my $xslt_base_cmd = "java -jar $jar_filename";
141    my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
142
143    my $transformed_xml = "";
144
145    if (open(XSLT_IN,"$xslt_cmd |")) {
146    while (defined (my $line = <XSLT_IN>)) {
147
148        $transformed_xml .= $line;
149    }
150    close(XSLT_IN);
151    }
152    else {
153    print $outhandle "Error: Unable to run command $xslt_cmd\n";
154    print $outhandle "       $!\n";
155    }
156
157    return $transformed_xml;
158
159}
160
161sub check_doctype {
162    my $self = shift (@_);
163   
164    my ($filename) = @_;
165
166    if (open(XMLIN,"<$filename")) {
167    my $doctype = $self->get_doctype();
168    ## check whether the doctype has the same name as the root element tag
169    while (defined (my $line = <XMLIN>)) {
170        ## find the root element
171        if ($line =~ /<([\w\d:]+)[\s>]/){
172        my $root = $1;
173        if ($root !~ $doctype){
174            close(XMLIN);
175            return 0;
176        }
177        else {
178            close(XMLIN);
179            return 1;
180        }
181        }
182    }
183    close(XMLIN);
184    }
185   
186    return undef; # haven't found a valid line
187   
188}
189
190# because we are not just using process_exp to determine whether to process or not, we need to implement this too, so that a file can be passed down if we are not actually processing it
191sub metadata_read {
192    my $self = shift (@_);
193   
194    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
195 
196    my $result = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);
197
198    if (defined $result) {
199    # we think we are processing this, but check that we actually are
200    my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
201
202    if ($self->check_doctype($filename)) {
203        return $result;
204    }
205    }
206    return undef;
207}
208
209sub read {
210    my $self = shift (@_); 
211 
212    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
213
214    # Make sure we're processing the correct file, do blocking etc
215    my ($block_status,$filename) = $self->read_block(@_);   
216    return $block_status if ((!defined $block_status) || ($block_status==0));
217
218    ## check the doctype to see whether we really want to process the file
219    if (!$self->check_doctype($filename)) {
220    # this file is not for us
221    return undef;
222    }
223
224    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
225    $self->{'base_dir'} = $base_dir;
226    $self->{'file'} = $file;
227    $self->{'filename'} = $filename;
228    $self->{'processor'} = $processor;
229    $self->{'metadata'} = $metadata;
230
231    eval {
232    my $xslt = $self->{'xslt'};
233    if (defined $xslt && ($xslt ne "")) {
234        # perform xslt
235        my $transformed_xml = $self->apply_xslt($xslt,$filename);
236
237        # feed transformed file (now in memory as string) into XML parser
238        $self->{'parser'}->parse($transformed_xml);
239    }
240    else {
241        $self->{'parser'}->parsefile($filename);
242    }
243    };
244 
245    if ($@) {
246
247    # parsefile may either croak somewhere in XML::Parser (e.g. because
248    # the document is not well formed) or die somewhere in XMLPlug or a
249    # derived plugin (e.g. because we're attempting to process a
250    # document whose DOCTYPE is not meant for this plugin). For the
251    # first case we'll print a warning and continue, for the second
252    # we'll just continue quietly
253
254    print STDERR "**** Error is: $@\n";
255
256    my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
257    if (defined $msg) {
258        my $outhandle = $self->{'outhandle'};
259        my $plugin_name = ref ($self);
260        print $outhandle "$plugin_name failed to process $file ($msg)\n";
261    }
262
263    # reset ourself for the next document
264    $self->{'section_level'}=0;
265    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
266    return -1; # error during processing
267    }
268
269   
270    return 1; # processed the file
271}
272
273# the following two methods are for if you want to do the parsing from a
274# plugin that inherits from this. it seems that you can't call the parse
275# methods directly. WHY???
276#
277# [Stefan 27/5/07] These two methods may not be necessary any more as I've
278# fixed XMLPlug so $self is no longer required to be a global variable
279# (that was why inheritance wasn't working quite right with XMLPlug I
280# think). I don't really know what other plugins rely on these methods
281# though so have left them here for now.
282sub parse_file {
283    my $self = shift (@_);
284    my ($filename) = @_;
285    $self->{'parser'}->parsefile($filename);
286}
287
288sub parse_string {
289    my $self = shift (@_);
290    my ($xml_string) = @_;
291    $self->{'parser'}->parse($xml_string);
292}
293
294sub get_default_process_exp {
295    my $self = shift (@_);
296
297    return q^(?i)\.xml$^;
298}
299
300sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
301sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
302sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
303sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
304sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
305sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
306sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
307sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
308sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
309sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
310
311# This Char function overrides the one in XML::Parser::Stream to overcome a
312# problem where $expat->{Text} is treated as the return value, slowing
313# things down significantly in some cases.
314sub Char {
315    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
316    $_[0]->{'Text'} .= $_[1];
317    return undef;
318}
319
320# Called at the beginning of the XML document.
321sub xml_start_document {
322    my $self = shift(@_);
323    my ($expat) = @_;
324
325    $self->open_document();
326}
327
328# Called for XML declarations
329sub xml_xmldecl {
330    my $self = shift(@_);
331    my ($expat, $version, $encoding, $standalone) = @_;
332}
333
334# Called for XML entities
335sub xml_entity {
336  my $self = shift(@_);
337  my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
338}
339
340# Called for DOCTYPE declarations - use die to bail out if this doctype
341# is not meant for this plugin
342sub xml_doctype {
343    my $self = shift(@_);
344
345    my ($expat, $name, $sysid, $pubid, $internal) = @_;
346    die "XMLPlug Cannot process XML document with DOCTYPE of $name";
347}
348
349
350# Called for every start tag. The $_ variable will contain a copy of the
351# tag and the %_ variable will contain the element's attributes.
352sub xml_start_tag {
353    my $self = shift(@_);
354    my ($expat, $element) = @_;
355}
356
357# Called for every end tag. The $_ variable will contain a copy of the tag.
358sub xml_end_tag {
359    my $self = shift(@_);
360    my ($expat, $element) = @_;
361}
362
363# Called just before start or end tags with accumulated non-markup text in
364# the $_ variable.
365sub xml_text {
366    my $self = shift(@_);
367    my ($expat) = @_;
368}
369
370# Called for processing instructions. The $_ variable will contain a copy
371# of the pi.
372sub xml_pi {
373    my $self = shift(@_);
374    my ($expat, $target, $data) = @_;
375}
376
377# Called at the end of the XML document.
378sub xml_end_document {
379    my $self = shift(@_);
380    my ($expat) = @_;
381
382    $self->close_document();
383}
384
385# Called for any characters not handled by the above functions.
386sub xml_default {
387    my $self = shift(@_);
388    my ($expat, $text) = @_;
389}
390
391sub open_document {
392    my $self = shift(@_);
393
394    # create a new document
395    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
396    $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
397}
398
399sub close_document {
400    my $self = shift(@_);
401    my $doc_obj = $self->{'doc_obj'};
402    # include any metadata passed in from previous plugins
403    # note that this metadata is associated with the top level section
404    $self->extra_metadata ($doc_obj,
405               $doc_obj->get_top_section(),
406               $self->{'metadata'});
407   
408    # do any automatic metadata extraction
409    $self->auto_extract_metadata ($doc_obj);
410   
411    # add an OID
412    $doc_obj->set_OID();
413   
414    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
415    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
416
417    # process the document
418    $self->{'processor'}->process($doc_obj);
419   
420    $self->{'num_processed'} ++;
421}
422
4231;
424
425
426
427
Note: See TracBrowser for help on using the browser.