root/trunk/gsdl/perllib/plugins/XMLPlug.pm @ 13226

Revision 13226, 11.3 KB (checked in by shaoqun, 13 years ago)

should allow element names with digits

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34    @XMLPlug::ISA = ('BasPlug');
35    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41    [ { 'name' => "process_exp",
42    'desc' => "{BasPlug.process_exp}",
43    'type' => "regexp",
44    'deft' => &get_default_process_exp(),
45    'reqd' => "no" },
46      { 'name' => "xslt",
47    'desc' => "{XMLPlug.xslt}",
48    'type' => "string",
49    'deft' => "",
50    'reqd' => "no" } ];
51
52my $options = { 'name'     => "XMLPlug",
53        'desc'     => "{XMLPlug.desc}",
54        'abstract' => "yes",
55        'inherits' => "yes",
56        'args'     => $arguments };
57
58
59our ($self);
60sub new {
61    my ($class) = shift (@_);
62    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63    push(@$pluginlist, $class);
64
65    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67   
68    # $self is global for use within subroutines called by XML::Parser
69    $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
70
71    if ($self->{'info_only'}) {
72    # don't worry about any options etc
73    return bless $self, $class;
74    }
75
76    my $parser = new XML::Parser('Style' => 'Stream',
77                 'Handlers' => {'Char' => \&Char,
78                        'XMLDecl' => \&XMLDecl,
79                        'Entity' => \&Entity,
80                        'Doctype' => \&Doctype,
81                        'Default' => \&Default,
82                        }); 
83
84    $self->{'parser'} = $parser;
85
86    return bless $self, $class;
87}
88
89# the inheriting class must implement this method to tell whether to parse this doc type
90sub get_doctype {
91    my $self = shift(@_);
92    die "$self The inheriting class must implement get_doctype method";
93}
94
95
96sub apply_xslt
97{
98    my $self = shift @_;
99    my ($xslt,$filename) = @_;
100   
101    my $outhandle = $self->{'outhandle'};
102
103    my $xslt_filename = $xslt;
104
105    if (! -e $xslt_filename) {
106    # Look in main site directory
107    my $gsdlhome = $ENV{'GSDLHOME'};
108    $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
109    }
110
111    if (! -e $xslt_filename) {
112    # Look in collection directory
113    my $coldir = $ENV{'GSDLCOLLECTDIR'};
114    $xslt_filename = &util::filename_cat($coldir,$xslt);
115    }
116
117    if (! -e $xslt_filename) {
118    print $outhandle "Warning: Unable to find XSLT $xslt\n";
119    if (open(XMLIN,"<$filename")) {
120
121        my $untransformed_xml = "";
122        while (defined (my $line = <XMLIN>)) {
123
124        $untransformed_xml .= $line;
125        }
126        close(XMLIN);
127       
128        return $untransformed_xml;
129    }
130    else {
131        print $outhandle "Error: Unable to open file $filename\n";
132        print $outhandle "       $!\n";
133        return "";
134    }
135   
136    }
137
138    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
139    my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
140    my $xslt_base_cmd = "java -jar $jar_filename";
141    my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
142
143    my $transformed_xml = "";
144
145    if (open(XSLT_IN,"$xslt_cmd |")) {
146    while (defined (my $line = <XSLT_IN>)) {
147
148        $transformed_xml .= $line;
149    }
150    close(XSLT_IN);
151    }
152    else {
153    print $outhandle "Error: Unable to run command $xslt_cmd\n";
154    print $outhandle "       $!\n";
155    }
156
157    return $transformed_xml;
158
159}
160
161sub check_doctype {
162    $self = shift (@_);
163   
164    my ($filename) = @_;
165   
166    if (open(XMLIN,"<$filename")) {
167    my $doctype = $self->get_doctype();
168    ## check whether the doctype has the same name as the root element tag
169    while (defined (my $line = <XMLIN>)) {
170        ## find the root element
171        if ($line =~ /<([\w\d:]+)[\s>]/){
172        my $root = $1;
173        if ($root !~ $doctype){
174            close(XMLIN);
175            return 0;
176        }
177        else {
178            close(XMLIN);
179            return 1;
180        }
181        }
182    }
183    close(XMLIN);
184    }
185   
186    return undef; # haven't found a valid line
187   
188}
189
190# because we are not just using process_exp to determine whether to process or not, we need to implement this too, so that a file can be passed down if we are not actually processing it
191sub metadata_read {
192    $self = shift (@_);
193   
194    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
195 
196    my $result = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);
197
198    if (defined $result) {
199    # we think we are processing this, but check that we actually are
200    my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
201
202    if ($self->check_doctype($filename)) {
203        return $result;
204    }
205    }
206    return undef;
207}
208
209sub read {
210    # $self must be global to work with XML callback routines.
211    $self = shift (@_); 
212 
213    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
214
215    # Make sure we're processing the correct file, do blocking etc
216    my ($block_status,$filename) = $self->read_block(@_);   
217    return $block_status if ((!defined $block_status) || ($block_status==0));
218
219    ## check the doctype to see whether we really want to process the file
220    if (!$self->check_doctype($filename)) {
221    # this file is not for us
222    return undef;
223    }
224
225    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
226    $self->{'base_dir'} = $base_dir;
227    $self->{'file'} = $file;
228    $self->{'filename'} = $filename;
229    $self->{'processor'} = $processor;
230    $self->{'metadata'} = $metadata;
231
232    eval {
233    my $xslt = $self->{'xslt'};
234    if (defined $xslt && ($xslt ne "")) {
235        # perform xslt
236        my $transformed_xml = $self->apply_xslt($xslt,$filename);
237
238        # feed transformed file (now in memory as string) into XML parser
239        $self->{'parser'}->parse($transformed_xml);
240    }
241    else {
242        $self->{'parser'}->parsefile($filename);
243    }
244    };
245 
246    if ($@) {
247
248    # parsefile may either croak somewhere in XML::Parser (e.g. because
249    # the document is not well formed) or die somewhere in XMLPlug or a
250    # derived plugin (e.g. because we're attempting to process a
251    # document whose DOCTYPE is not meant for this plugin). For the
252    # first case we'll print a warning and continue, for the second
253    # we'll just continue quietly
254
255    print STDERR "**** Error is: $@\n";
256
257    my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
258    if (defined $msg) {
259        my $outhandle = $self->{'outhandle'};
260        my $plugin_name = ref ($self);
261        print $outhandle "$plugin_name failed to process $file ($msg)\n";
262    }
263
264    # reset ourself for the next document
265    $self->{'section_level'}=0;
266    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
267    return -1; # error during processing
268    }
269
270   
271    return 1; # processed the file
272}
273
274# the following two methods are for if you want to do the parsing from a
275# plugin that inherits from this. it seems that you can't call the parse
276# methods directly. WHY???
277sub parse_file {
278    $self = shift (@_);
279    my ($filename) = @_;
280    $self->{'parser'}->parsefile($filename);
281}
282
283sub parse_string {
284    $self = shift (@_);
285    my ($xml_string) = @_;
286    $self->{'parser'}->parse($xml_string);
287}
288
289sub get_default_process_exp {
290    my $self = shift (@_);
291
292    return q^(?i)\.xml$^;
293}
294
295sub StartDocument {$self->xml_start_document(@_);}
296sub XMLDecl {$self->xml_xmldecl(@_);}
297sub Entity {$self->xml_entity(@_);}
298sub Doctype {$self->xml_doctype(@_);}
299sub StartTag {$self->xml_start_tag(@_);}
300sub EndTag {$self->xml_end_tag(@_);}
301sub Text {$self->xml_text(@_);}
302sub PI {$self->xml_pi(@_);}
303sub EndDocument {$self->xml_end_document(@_);}
304sub Default {$self->xml_default(@_);}
305
306# This Char function overrides the one in XML::Parser::Stream to overcome a
307# problem where $expat->{Text} is treated as the return value, slowing
308# things down significantly in some cases.
309sub Char {
310    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
311    $_[0]->{'Text'} .= $_[1];
312    return undef;
313}
314
315# Called at the beginning of the XML document.
316sub xml_start_document {
317    my $self = shift(@_);
318    my ($expat) = @_;
319
320    $self->open_document();
321}
322
323# Called for XML declarations
324sub xml_xmldecl {
325    my $self = shift(@_);
326    my ($expat, $version, $encoding, $standalone) = @_;
327}
328
329# Called for XML entities
330sub xml_entity {
331  my $self = shift(@_);
332  my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
333}
334
335# Called for DOCTYPE declarations - use die to bail out if this doctype
336# is not meant for this plugin
337sub xml_doctype {
338    my $self = shift(@_);
339
340    my ($expat, $name, $sysid, $pubid, $internal) = @_;
341    die "XMLPlug Cannot process XML document with DOCTYPE of $name";
342}
343
344
345# Called for every start tag. The $_ variable will contain a copy of the
346# tag and the %_ variable will contain the element's attributes.
347sub xml_start_tag {
348    my $self = shift(@_);
349    my ($expat, $element) = @_;
350}
351
352# Called for every end tag. The $_ variable will contain a copy of the tag.
353sub xml_end_tag {
354    my $self = shift(@_);
355    my ($expat, $element) = @_;
356}
357
358# Called just before start or end tags with accumulated non-markup text in
359# the $_ variable.
360sub xml_text {
361    my $self = shift(@_);
362    my ($expat) = @_;
363}
364
365# Called for processing instructions. The $_ variable will contain a copy
366# of the pi.
367sub xml_pi {
368    my $self = shift(@_);
369    my ($expat, $target, $data) = @_;
370}
371
372# Called at the end of the XML document.
373sub xml_end_document {
374    my $self = shift(@_);
375    my ($expat) = @_;
376
377    $self->close_document();
378}
379
380# Called for any characters not handled by the above functions.
381sub xml_default {
382    my $self = shift(@_);
383    my ($expat, $text) = @_;
384}
385
386sub open_document {
387    my $self = shift(@_);
388
389    # create a new document
390    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
391    $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
392}
393
394sub close_document {
395    my $self = shift(@_);
396    my $doc_obj = $self->{'doc_obj'};
397    # include any metadata passed in from previous plugins
398    # note that this metadata is associated with the top level section
399    $self->extra_metadata ($doc_obj,
400               $doc_obj->get_top_section(),
401               $self->{'metadata'});
402   
403    # do any automatic metadata extraction
404    $self->auto_extract_metadata ($doc_obj);
405   
406    # add an OID
407    $doc_obj->set_OID();
408   
409    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
410    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
411
412    # process the document
413    $self->{'processor'}->process($doc_obj);
414   
415    $self->{'num_processed'} ++;
416}
417
4181;
419
420
421
422
Note: See TracBrowser for help on using the browser.