root/gsdl/trunk/perllib/plugins/ReadXMLFile.pm @ 18327

Revision 18327, 11.2 KB (checked in by ak19, 11 years ago)

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ReadXMLFile.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadXMLFile;
27
28use BasePlugin;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34    @ReadXMLFile::ISA = ('BasePlugin');
35    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41    [ { 'name' => "process_exp",
42    'desc' => "{BasePlugin.process_exp}",
43    'type' => "regexp",
44    'deft' => &get_default_process_exp(),
45    'reqd' => "no" },
46      { 'name' => "xslt",
47    'desc' => "{ReadXMLFile.xslt}",
48    'type' => "string",
49    'deft' => "",
50    'reqd' => "no" } ];
51
52my $options = { 'name'     => "ReadXMLFile",
53        'desc'     => "{ReadXMLFile.desc}",
54        'abstract' => "yes",
55        'inherits' => "yes",
56        'args'     => $arguments };
57
58sub new {
59    my ($class) = shift (@_);
60    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61    push(@$pluginlist, $class);
62
63    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64    push(@{$hashArgOptLists->{"OptList"}},$options);
65   
66    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
67
68    if ($self->{'info_only'}) {
69    # don't worry about creating the XML parser as all we want is the
70    # list of plugin options
71    return bless $self, $class;
72    }
73
74    my $parser = new XML::Parser('Style' => 'Stream',
75                                 'Pkg' => 'ReadXMLFile',
76                                 'PluginObj' => $self,
77                 'Namespaces' => 1, # strip out namespaces
78                 'Handlers' => {'Char' => \&Char,
79                        'XMLDecl' => \&XMLDecl,
80                        'Entity'  => \&Entity,
81                        'Doctype' => \&Doctype,
82                        'Default' => \&Default
83                                 });
84
85    $self->{'parser'} = $parser;
86
87    return bless $self, $class;
88}
89
90# the inheriting class must implement this method to tell whether to parse this doc type
91sub get_doctype {
92    my $self = shift(@_);
93    die "$self The inheriting class must implement get_doctype method";
94}
95
96
97sub apply_xslt
98{
99    my $self = shift @_;
100    my ($xslt,$filename) = @_;
101   
102    my $outhandle = $self->{'outhandle'};
103
104    my $xslt_filename = $xslt;
105
106    if (! -e $xslt_filename) {
107    # Look in main site directory
108    my $gsdlhome = $ENV{'GSDLHOME'};
109    $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
110    }
111
112    if (! -e $xslt_filename) {
113    # Look in collection directory
114    my $coldir = $ENV{'GSDLCOLLECTDIR'};
115    $xslt_filename = &util::filename_cat($coldir,$xslt);
116    }
117
118    if (! -e $xslt_filename) {
119    print $outhandle "Warning: Unable to find XSLT $xslt\n";
120    if (open(XMLIN,"<$filename")) {
121
122        my $untransformed_xml = "";
123        while (defined (my $line = <XMLIN>)) {
124
125        $untransformed_xml .= $line;
126        }
127        close(XMLIN);
128       
129        return $untransformed_xml;
130    }
131    else {
132        print $outhandle "Error: Unable to open file $filename\n";
133        print $outhandle "       $!\n";
134        return "";
135    }
136   
137    }
138
139    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
140    my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
141    my $xslt_base_cmd = "java -jar $jar_filename";
142    my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
143
144    my $transformed_xml = "";
145
146    if (open(XSLT_IN,"$xslt_cmd |")) {
147    while (defined (my $line = <XSLT_IN>)) {
148
149        $transformed_xml .= $line;
150    }
151    close(XSLT_IN);
152    }
153    else {
154    print $outhandle "Error: Unable to run command $xslt_cmd\n";
155    print $outhandle "       $!\n";
156    }
157
158    return $transformed_xml;
159
160}
161
162sub can_process_this_file {
163    my $self = shift(@_);
164    my ($filename) = @_;
165
166    if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
167    return 1; # its a file for us
168    }
169    return 0;
170}
171
172sub check_doctype {
173    my $self = shift (@_);
174   
175    my ($filename) = @_;
176
177    if (open(XMLIN,"<$filename")) {
178    my $doctype = $self->get_doctype();
179    ## check whether the doctype has the same name as the root element tag
180    while (defined (my $line = <XMLIN>)) {
181        ## find the root element
182        if ($line =~ /<([\w\d:]+)[\s>]/){
183        my $root = $1;
184        if ($root !~ $doctype){
185            close(XMLIN);
186            return 0;
187        }
188        else {
189            close(XMLIN);
190            return 1;
191        }
192        }
193    }
194    close(XMLIN);
195    }
196   
197    return undef; # haven't found a valid line
198   
199}
200
201sub read {
202    my $self = shift (@_); 
203 
204    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
205
206    # can we process this file??
207    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
208    return undef unless $self->can_process_this_file($filename_full_path);
209   
210    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
211    $self->{'base_dir'} = $base_dir;
212    $self->{'file'} = $file;
213    $self->{'filename'} = $filename_full_path;
214    $self->{'processor'} = $processor;
215    # this contains metadata passed in from running metadata_read with other plugins (eg from MetadataXMLPlugin)
216    # we are also using it to store up any metadata found during parsing the XML, so that it can be added to the doc obj.
217    $self->{'metadata'} = $metadata;
218
219    if ($self->parse_file($filename_full_path)) {
220    return 1; # processed the file
221    }
222    return -1;
223}
224
225
226sub parse_file {
227    my $self = shift (@_);
228    my ($filename_full_path, $file, $gli) = @_;
229    eval {
230    my $xslt = $self->{'xslt'};
231    if (defined $xslt && ($xslt ne "")) {
232        # perform xslt
233        my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
234
235        # feed transformed file (now in memory as string) into XML parser
236        $self->{'parser'}->parse($transformed_xml);
237    }
238    else {
239        $self->{'parser'}->parsefile($filename_full_path);
240    }
241    };
242 
243    if ($@) {
244
245    # parsefile may either croak somewhere in XML::Parser (e.g. because
246    # the document is not well formed) or die somewhere in ReadXMLFile or a
247    # derived plugin (e.g. because we're attempting to process a
248    # document whose DOCTYPE is not meant for this plugin). For the
249    # first case we'll print a warning and continue, for the second
250    # we'll just continue quietly
251
252    print STDERR "**** Error is: $@\n";
253
254    my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
255    if (defined $msg) {
256        my $outhandle = $self->{'outhandle'};
257        my $plugin_name = ref ($self);
258        print $outhandle "$plugin_name failed to process $file ($msg)\n";
259    }
260
261    # reset ourself for the next document
262    $self->{'section_level'}=0;
263    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
264    return -1; # error during processing
265    }
266    return 1; # parsing was successful
267}
268
269sub get_default_process_exp {
270    my $self = shift (@_);
271
272    return q^(?i)\.xml$^;
273}
274
275sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
276sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
277sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
278sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
279sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
280sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
281sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
282sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
283sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
284sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
285
286# This Char function overrides the one in XML::Parser::Stream to overcome a
287# problem where $expat->{Text} is treated as the return value, slowing
288# things down significantly in some cases.
289sub Char {
290    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
291    $_[0]->{'Text'} .= $_[1];
292    return undef;
293}
294
295# Called at the beginning of the XML document.
296sub xml_start_document {
297    my $self = shift(@_);
298    my ($expat) = @_;
299
300    $self->open_document();
301}
302
303# Called for XML declarations
304sub xml_xmldecl {
305    my $self = shift(@_);
306    my ($expat, $version, $encoding, $standalone) = @_;
307}
308
309# Called for XML entities
310sub xml_entity {
311  my $self = shift(@_);
312  my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
313}
314
315# Called for DOCTYPE declarations - use die to bail out if this doctype
316# is not meant for this plugin
317sub xml_doctype {
318    my $self = shift(@_);
319
320    my ($expat, $name, $sysid, $pubid, $internal) = @_;
321    die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
322}
323
324
325# Called for every start tag. The $_ variable will contain a copy of the
326# tag and the %_ variable will contain the element's attributes.
327sub xml_start_tag {
328    my $self = shift(@_);
329    my ($expat, $element) = @_;
330}
331
332# Called for every end tag. The $_ variable will contain a copy of the tag.
333sub xml_end_tag {
334    my $self = shift(@_);
335    my ($expat, $element) = @_;
336}
337
338# Called just before start or end tags with accumulated non-markup text in
339# the $_ variable.
340sub xml_text {
341    my $self = shift(@_);
342    my ($expat) = @_;
343}
344
345# Called for processing instructions. The $_ variable will contain a copy
346# of the pi.
347sub xml_pi {
348    my $self = shift(@_);
349    my ($expat, $target, $data) = @_;
350}
351
352# Called at the end of the XML document.
353sub xml_end_document {
354    my $self = shift(@_);
355    my ($expat) = @_;
356
357    $self->close_document();
358}
359
360# Called for any characters not handled by the above functions.
361sub xml_default {
362    my $self = shift(@_);
363    my ($expat, $text) = @_;
364}
365
366sub open_document {
367    my $self = shift(@_);
368
369    # create a new document
370    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
371    $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
372
373    # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
374}
375
376sub close_document {
377    my $self = shift(@_);
378    my $doc_obj = $self->{'doc_obj'};
379
380    # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
381
382    # include any metadata passed in from previous plugins
383    # note that this metadata is associated with the top level section
384    $self->extra_metadata ($doc_obj,
385               $doc_obj->get_top_section(),
386               $self->{'metadata'});
387   
388    # do any automatic metadata extraction
389    $self->auto_extract_metadata ($doc_obj);
390   
391    # add an OID
392    $self->add_OID($doc_obj);
393   
394    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
395    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
396
397    # process the document
398    $self->{'processor'}->process($doc_obj);
399   
400    $self->{'num_processed'} ++;
401    undef $self->{'doc_obj'};
402    undef $doc_obj; # is this the same as above??
403}
404
4051;
406
407
408
409
Note: See TracBrowser for help on using the browser.