source: main/trunk/greenstone2/perllib/plugins/ReadXMLFile.pm@ 24348

Last change on this file since 24348 was 24348, checked in by davidb, 13 years ago

Better line wrapping for 80 col display

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
RevLine 
[2810]1###########################################################################
2#
[15871]3# ReadXMLFile.pm -- base class for XML plugins
[2810]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[15871]26package ReadXMLFile;
[2810]27
[15871]28use BasePlugin;
[2810]29use doc;
[10254]30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
[2810]32
33sub BEGIN {
[15871]34 @ReadXMLFile::ISA = ('BasePlugin');
[2810]35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
[8069]38use XMLParser;
[2810]39
[4744]40my $arguments =
41 [ { 'name' => "process_exp",
[15871]42 'desc' => "{BasePlugin.process_exp}",
[6408]43 'type' => "regexp",
[4744]44 'deft' => &get_default_process_exp(),
[9957]45 'reqd' => "no" },
46 { 'name' => "xslt",
[15871]47 'desc' => "{ReadXMLFile.xslt}",
[9957]48 'type' => "string",
49 'deft' => "",
[4744]50 'reqd' => "no" } ];
51
[15871]52my $options = { 'name' => "ReadXMLFile",
53 'desc' => "{ReadXMLFile.desc}",
[7244]54 'abstract' => "yes",
[4744]55 'inherits' => "yes",
56 'args' => $arguments };
[3540]57
[2810]58sub new {
[10218]59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
[2810]62
[15871]63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]65
[15871]66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[9957]67
[11090]68 if ($self->{'info_only'}) {
[15871]69 # don't worry about creating the XML parser as all we want is the
70 # list of plugin options
[11090]71 return bless $self, $class;
72 }
73
[2810]74 my $parser = new XML::Parser('Style' => 'Stream',
[15871]75 'Pkg' => 'ReadXMLFile',
[14103]76 'PluginObj' => $self,
[2810]77 'Handlers' => {'Char' => \&Char,
78 'XMLDecl' => \&XMLDecl,
[16822]79 'Entity' => \&Entity,
[2810]80 'Doctype' => \&Doctype,
[16822]81 'Default' => \&Default
[14103]82 });
[13148]83
[2810]84 $self->{'parser'} = $parser;
85
86 return bless $self, $class;
87}
88
[13192]89# the inheriting class must implement this method to tell whether to parse this doc type
90sub get_doctype {
91 my $self = shift(@_);
[13221]92 die "$self The inheriting class must implement get_doctype method";
[13192]93}
94
95
[9957]96sub apply_xslt
97{
98 my $self = shift @_;
99 my ($xslt,$filename) = @_;
100
101 my $outhandle = $self->{'outhandle'};
[2810]102
[9957]103 my $xslt_filename = $xslt;
104
105 if (! -e $xslt_filename) {
106 # Look in main site directory
107 my $gsdlhome = $ENV{'GSDLHOME'};
108 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
109 }
110
111 if (! -e $xslt_filename) {
112 # Look in collection directory
[11661]113 my $coldir = $ENV{'GSDLCOLLECTDIR'};
[9957]114 $xslt_filename = &util::filename_cat($coldir,$xslt);
115 }
116
117 if (! -e $xslt_filename) {
118 print $outhandle "Warning: Unable to find XSLT $xslt\n";
119 if (open(XMLIN,"<$filename")) {
120
[10254]121 my $untransformed_xml = "";
[9957]122 while (defined (my $line = <XMLIN>)) {
123
124 $untransformed_xml .= $line;
125 }
126 close(XMLIN);
127
128 return $untransformed_xml;
129 }
130 else {
131 print $outhandle "Error: Unable to open file $filename\n";
132 print $outhandle " $!\n";
133 return "";
134 }
135
136 }
137
138 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
139 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
140 my $xslt_base_cmd = "java -jar $jar_filename";
141 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
142
143 my $transformed_xml = "";
144
145 if (open(XSLT_IN,"$xslt_cmd |")) {
146 while (defined (my $line = <XSLT_IN>)) {
147
148 $transformed_xml .= $line;
149 }
150 close(XSLT_IN);
151 }
152 else {
[10254]153 print $outhandle "Error: Unable to run command $xslt_cmd\n";
[9957]154 print $outhandle " $!\n";
155 }
156
157 return $transformed_xml;
158
159}
160
[16392]161sub can_process_this_file {
162 my $self = shift(@_);
163 my ($filename) = @_;
164
[24348]165 if (-f $filename
166 && $self->SUPER::can_process_this_file($filename)
167 && $self->check_doctype($filename)) {
[16392]168 return 1; # its a file for us
169 }
170 return 0;
171}
172
[13192]173sub check_doctype {
[14103]174 my $self = shift (@_);
[13192]175
176 my ($filename) = @_;
[14103]177
[13192]178 if (open(XMLIN,"<$filename")) {
179 my $doctype = $self->get_doctype();
180 ## check whether the doctype has the same name as the root element tag
181 while (defined (my $line = <XMLIN>)) {
182 ## find the root element
[13226]183 if ($line =~ /<([\w\d:]+)[\s>]/){
[13192]184 my $root = $1;
185 if ($root !~ $doctype){
186 close(XMLIN);
187 return 0;
188 }
189 else {
190 close(XMLIN);
191 return 1;
192 }
193 }
194 }
195 close(XMLIN);
196 }
197
198 return undef; # haven't found a valid line
199
200}
[9957]201
[2810]202sub read {
[14103]203 my $self = shift (@_);
[2810]204
[16392]205 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[2810]206
[16392]207 # can we process this file??
208 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
209 return undef unless $self->can_process_this_file($filename_full_path);
210
[2810]211 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[12844]212 $self->{'base_dir'} = $base_dir;
[2810]213 $self->{'file'} = $file;
[15871]214 $self->{'filename'} = $filename_full_path;
[20830]215 $self->{'filename_no_path'} = $filename_no_path;
[2810]216 $self->{'processor'} = $processor;
[23349]217
[17289]218 # this contains metadata passed in from running metadata_read with other plugins (eg from MetadataXMLPlugin)
219 # we are also using it to store up any metadata found during parsing the XML, so that it can be added to the doc obj.
[2810]220 $self->{'metadata'} = $metadata;
[9957]221
[17293]222 if ($self->parse_file($filename_full_path)) {
[17289]223 return 1; # processed the file
224 }
225 return -1;
226}
227
228
229sub parse_file {
230 my $self = shift (@_);
231 my ($filename_full_path, $file, $gli) = @_;
[2810]232 eval {
[9957]233 my $xslt = $self->{'xslt'};
234 if (defined $xslt && ($xslt ne "")) {
235 # perform xslt
[15871]236 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
[9957]237
238 # feed transformed file (now in memory as string) into XML parser
239 $self->{'parser'}->parse($transformed_xml);
240 }
241 else {
[15871]242 $self->{'parser'}->parsefile($filename_full_path);
[9957]243 }
[2810]244 };
[7900]245
[2810]246 if ($@) {
247
248 # parsefile may either croak somewhere in XML::Parser (e.g. because
[15871]249 # the document is not well formed) or die somewhere in ReadXMLFile or a
[2810]250 # derived plugin (e.g. because we're attempting to process a
251 # document whose DOCTYPE is not meant for this plugin). For the
252 # first case we'll print a warning and continue, for the second
253 # we'll just continue quietly
254
[10170]255 print STDERR "**** Error is: $@\n";
[7900]256
[2810]257 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
258 if (defined $msg) {
259 my $outhandle = $self->{'outhandle'};
260 my $plugin_name = ref ($self);
261 print $outhandle "$plugin_name failed to process $file ($msg)\n";
262 }
[7900]263
[3107]264 # reset ourself for the next document
265 $self->{'section_level'}=0;
[9584]266 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[7362]267 return -1; # error during processing
[2810]268 }
[17289]269 return 1; # parsing was successful
[2810]270}
271
272sub get_default_process_exp {
273 my $self = shift (@_);
274
275 return q^(?i)\.xml$^;
276}
277
[14103]278sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
279sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
280sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
281sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
282sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
283sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
284sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
285sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
286sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
287sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
[2810]288
289# This Char function overrides the one in XML::Parser::Stream to overcome a
290# problem where $expat->{Text} is treated as the return value, slowing
291# things down significantly in some cases.
292sub Char {
[9462]293 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
294 $_[0]->{'Text'} .= $_[1];
295 return undef;
[2810]296}
297
[18528]298
[2810]299# Called at the beginning of the XML document.
300sub xml_start_document {
301 my $self = shift(@_);
302 my ($expat) = @_;
303
304 $self->open_document();
305}
306
307# Called for XML declarations
308sub xml_xmldecl {
309 my $self = shift(@_);
310 my ($expat, $version, $encoding, $standalone) = @_;
311}
312
[2890]313# Called for XML entities
314sub xml_entity {
315 my $self = shift(@_);
316 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
317}
318
[2810]319# Called for DOCTYPE declarations - use die to bail out if this doctype
320# is not meant for this plugin
321sub xml_doctype {
322 my $self = shift(@_);
[13148]323
[2810]324 my ($expat, $name, $sysid, $pubid, $internal) = @_;
[15871]325 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
[2810]326}
327
[13148]328
[2810]329# Called for every start tag. The $_ variable will contain a copy of the
330# tag and the %_ variable will contain the element's attributes.
331sub xml_start_tag {
332 my $self = shift(@_);
333 my ($expat, $element) = @_;
334}
335
336# Called for every end tag. The $_ variable will contain a copy of the tag.
337sub xml_end_tag {
338 my $self = shift(@_);
339 my ($expat, $element) = @_;
340}
341
342# Called just before start or end tags with accumulated non-markup text in
343# the $_ variable.
344sub xml_text {
345 my $self = shift(@_);
346 my ($expat) = @_;
347}
348
349# Called for processing instructions. The $_ variable will contain a copy
350# of the pi.
351sub xml_pi {
352 my $self = shift(@_);
353 my ($expat, $target, $data) = @_;
354}
355
356# Called at the end of the XML document.
357sub xml_end_document {
358 my $self = shift(@_);
359 my ($expat) = @_;
360
361 $self->close_document();
362}
363
364# Called for any characters not handled by the above functions.
365sub xml_default {
366 my $self = shift(@_);
367 my ($expat, $text) = @_;
368}
369
370sub open_document {
371 my $self = shift(@_);
372
[23352]373 my $metadata = $self->{'metadata'};
374 my $filename_full_path = $self->{'filename'};
[23349]375
[2810]376 # create a new document
[23352]377 my $doc_obj = $self->{'doc_obj'} = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[23349]378
379 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
380
[23352]381 my $filename_no_path = $self->{'filename_no_path'};
382 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23349]383 my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding);
384
[23352]385 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
[20830]386
[15871]387 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
[2810]388}
389
390sub close_document {
391 my $self = shift(@_);
[8716]392 my $doc_obj = $self->{'doc_obj'};
[15871]393
394 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
395
[2810]396 # include any metadata passed in from previous plugins
397 # note that this metadata is associated with the top level section
[8716]398 $self->extra_metadata ($doc_obj,
399 $doc_obj->get_top_section(),
[2810]400 $self->{'metadata'});
401
402 # do any automatic metadata extraction
[8716]403 $self->auto_extract_metadata ($doc_obj);
[2810]404
405 # add an OID
[17026]406 $self->add_OID($doc_obj);
[2810]407
[7508]408 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8121]409 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
[5919]410
[2810]411 # process the document
[8716]412 $self->{'processor'}->process($doc_obj);
[2810]413
414 $self->{'num_processed'} ++;
[15871]415 undef $self->{'doc_obj'};
416 undef $doc_obj; # is this the same as above??
[2810]417}
418
4191;
420
[7900]421
422
423
Note: See TracBrowser for help on using the repository browser.