source: gsdl/trunk/perllib/plugins/ReadXMLFile.pm@ 18528

Last change on this file since 18528 was 18528, checked in by davidb, 15 years ago

OIDmetadata wasn't supported in collect.cfg, but OIDtype was. Now rectified. Also introduced OIDcount as a file saved in the archives folder to help doc.pm use the correct value when working incrementally

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
RevLine 
[2810]1###########################################################################
2#
[15871]3# ReadXMLFile.pm -- base class for XML plugins
[2810]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[15871]26package ReadXMLFile;
[2810]27
[15871]28use BasePlugin;
[2810]29use doc;
[10254]30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
[2810]32
33sub BEGIN {
[15871]34 @ReadXMLFile::ISA = ('BasePlugin');
[2810]35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
[8069]38use XMLParser;
[2810]39
[4744]40my $arguments =
41 [ { 'name' => "process_exp",
[15871]42 'desc' => "{BasePlugin.process_exp}",
[6408]43 'type' => "regexp",
[4744]44 'deft' => &get_default_process_exp(),
[9957]45 'reqd' => "no" },
46 { 'name' => "xslt",
[15871]47 'desc' => "{ReadXMLFile.xslt}",
[9957]48 'type' => "string",
49 'deft' => "",
[4744]50 'reqd' => "no" } ];
51
[15871]52my $options = { 'name' => "ReadXMLFile",
53 'desc' => "{ReadXMLFile.desc}",
[7244]54 'abstract' => "yes",
[4744]55 'inherits' => "yes",
56 'args' => $arguments };
[3540]57
[2810]58sub new {
[10218]59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
[2810]62
[15871]63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64 push(@{$hashArgOptLists->{"OptList"}},$options);
[10218]65
[15871]66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[9957]67
[11090]68 if ($self->{'info_only'}) {
[15871]69 # don't worry about creating the XML parser as all we want is the
70 # list of plugin options
[11090]71 return bless $self, $class;
72 }
73
[2810]74 my $parser = new XML::Parser('Style' => 'Stream',
[15871]75 'Pkg' => 'ReadXMLFile',
[14103]76 'PluginObj' => $self,
[16696]77 'Namespaces' => 1, # strip out namespaces
[2810]78 'Handlers' => {'Char' => \&Char,
79 'XMLDecl' => \&XMLDecl,
[16822]80 'Entity' => \&Entity,
[2810]81 'Doctype' => \&Doctype,
[16822]82 'Default' => \&Default
[14103]83 });
[13148]84
[2810]85 $self->{'parser'} = $parser;
86
87 return bless $self, $class;
88}
89
[13192]90# the inheriting class must implement this method to tell whether to parse this doc type
91sub get_doctype {
92 my $self = shift(@_);
[13221]93 die "$self The inheriting class must implement get_doctype method";
[13192]94}
95
96
[9957]97sub apply_xslt
98{
99 my $self = shift @_;
100 my ($xslt,$filename) = @_;
101
102 my $outhandle = $self->{'outhandle'};
[2810]103
[9957]104 my $xslt_filename = $xslt;
105
106 if (! -e $xslt_filename) {
107 # Look in main site directory
108 my $gsdlhome = $ENV{'GSDLHOME'};
109 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
110 }
111
112 if (! -e $xslt_filename) {
113 # Look in collection directory
[11661]114 my $coldir = $ENV{'GSDLCOLLECTDIR'};
[9957]115 $xslt_filename = &util::filename_cat($coldir,$xslt);
116 }
117
118 if (! -e $xslt_filename) {
119 print $outhandle "Warning: Unable to find XSLT $xslt\n";
120 if (open(XMLIN,"<$filename")) {
121
[10254]122 my $untransformed_xml = "";
[9957]123 while (defined (my $line = <XMLIN>)) {
124
125 $untransformed_xml .= $line;
126 }
127 close(XMLIN);
128
129 return $untransformed_xml;
130 }
131 else {
132 print $outhandle "Error: Unable to open file $filename\n";
133 print $outhandle " $!\n";
134 return "";
135 }
136
137 }
138
139 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
140 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
141 my $xslt_base_cmd = "java -jar $jar_filename";
142 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
143
144 my $transformed_xml = "";
145
146 if (open(XSLT_IN,"$xslt_cmd |")) {
147 while (defined (my $line = <XSLT_IN>)) {
148
149 $transformed_xml .= $line;
150 }
151 close(XSLT_IN);
152 }
153 else {
[10254]154 print $outhandle "Error: Unable to run command $xslt_cmd\n";
[9957]155 print $outhandle " $!\n";
156 }
157
158 return $transformed_xml;
159
160}
161
[16392]162sub can_process_this_file {
163 my $self = shift(@_);
164 my ($filename) = @_;
165
[17322]166 if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
[16392]167 return 1; # its a file for us
168 }
169 return 0;
170}
171
[13192]172sub check_doctype {
[14103]173 my $self = shift (@_);
[13192]174
175 my ($filename) = @_;
[14103]176
[13192]177 if (open(XMLIN,"<$filename")) {
178 my $doctype = $self->get_doctype();
179 ## check whether the doctype has the same name as the root element tag
180 while (defined (my $line = <XMLIN>)) {
181 ## find the root element
[13226]182 if ($line =~ /<([\w\d:]+)[\s>]/){
[13192]183 my $root = $1;
184 if ($root !~ $doctype){
185 close(XMLIN);
186 return 0;
187 }
188 else {
189 close(XMLIN);
190 return 1;
191 }
192 }
193 }
194 close(XMLIN);
195 }
196
197 return undef; # haven't found a valid line
198
199}
[9957]200
[2810]201sub read {
[14103]202 my $self = shift (@_);
[2810]203
[16392]204 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[2810]205
[16392]206 # can we process this file??
207 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
208 return undef unless $self->can_process_this_file($filename_full_path);
209
[2810]210 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[12844]211 $self->{'base_dir'} = $base_dir;
[2810]212 $self->{'file'} = $file;
[15871]213 $self->{'filename'} = $filename_full_path;
[2810]214 $self->{'processor'} = $processor;
[17289]215 # this contains metadata passed in from running metadata_read with other plugins (eg from MetadataXMLPlugin)
216 # we are also using it to store up any metadata found during parsing the XML, so that it can be added to the doc obj.
[2810]217 $self->{'metadata'} = $metadata;
[9957]218
[17293]219 if ($self->parse_file($filename_full_path)) {
[17289]220 return 1; # processed the file
221 }
222 return -1;
223}
224
225
226sub parse_file {
227 my $self = shift (@_);
228 my ($filename_full_path, $file, $gli) = @_;
[2810]229 eval {
[9957]230 my $xslt = $self->{'xslt'};
231 if (defined $xslt && ($xslt ne "")) {
232 # perform xslt
[15871]233 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
[9957]234
235 # feed transformed file (now in memory as string) into XML parser
236 $self->{'parser'}->parse($transformed_xml);
237 }
238 else {
[15871]239 $self->{'parser'}->parsefile($filename_full_path);
[9957]240 }
[2810]241 };
[7900]242
[2810]243 if ($@) {
244
245 # parsefile may either croak somewhere in XML::Parser (e.g. because
[15871]246 # the document is not well formed) or die somewhere in ReadXMLFile or a
[2810]247 # derived plugin (e.g. because we're attempting to process a
248 # document whose DOCTYPE is not meant for this plugin). For the
249 # first case we'll print a warning and continue, for the second
250 # we'll just continue quietly
251
[10170]252 print STDERR "**** Error is: $@\n";
[7900]253
[2810]254 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
255 if (defined $msg) {
256 my $outhandle = $self->{'outhandle'};
257 my $plugin_name = ref ($self);
258 print $outhandle "$plugin_name failed to process $file ($msg)\n";
259 }
[7900]260
[3107]261 # reset ourself for the next document
262 $self->{'section_level'}=0;
[9584]263 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[7362]264 return -1; # error during processing
[2810]265 }
[17289]266 return 1; # parsing was successful
[2810]267}
268
269sub get_default_process_exp {
270 my $self = shift (@_);
271
272 return q^(?i)\.xml$^;
273}
274
[14103]275sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
276sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
277sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
278sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
279sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
280sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
281sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
282sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
283sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
284sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
[2810]285
286# This Char function overrides the one in XML::Parser::Stream to overcome a
287# problem where $expat->{Text} is treated as the return value, slowing
288# things down significantly in some cases.
289sub Char {
[9462]290 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
291 $_[0]->{'Text'} .= $_[1];
292 return undef;
[2810]293}
294
[18528]295
[2810]296# Called at the beginning of the XML document.
297sub xml_start_document {
298 my $self = shift(@_);
299 my ($expat) = @_;
300
301 $self->open_document();
302}
303
304# Called for XML declarations
305sub xml_xmldecl {
306 my $self = shift(@_);
307 my ($expat, $version, $encoding, $standalone) = @_;
308}
309
[2890]310# Called for XML entities
311sub xml_entity {
312 my $self = shift(@_);
313 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
314}
315
[2810]316# Called for DOCTYPE declarations - use die to bail out if this doctype
317# is not meant for this plugin
318sub xml_doctype {
319 my $self = shift(@_);
[13148]320
[2810]321 my ($expat, $name, $sysid, $pubid, $internal) = @_;
[15871]322 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
[2810]323}
324
[13148]325
[2810]326# Called for every start tag. The $_ variable will contain a copy of the
327# tag and the %_ variable will contain the element's attributes.
328sub xml_start_tag {
329 my $self = shift(@_);
330 my ($expat, $element) = @_;
331}
332
333# Called for every end tag. The $_ variable will contain a copy of the tag.
334sub xml_end_tag {
335 my $self = shift(@_);
336 my ($expat, $element) = @_;
337}
338
339# Called just before start or end tags with accumulated non-markup text in
340# the $_ variable.
341sub xml_text {
342 my $self = shift(@_);
343 my ($expat) = @_;
344}
345
346# Called for processing instructions. The $_ variable will contain a copy
347# of the pi.
348sub xml_pi {
349 my $self = shift(@_);
350 my ($expat, $target, $data) = @_;
351}
352
353# Called at the end of the XML document.
354sub xml_end_document {
355 my $self = shift(@_);
356 my ($expat) = @_;
357
358 $self->close_document();
359}
360
361# Called for any characters not handled by the above functions.
362sub xml_default {
363 my $self = shift(@_);
364 my ($expat, $text) = @_;
365}
366
367sub open_document {
368 my $self = shift(@_);
369
370 # create a new document
[18327]371 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
[15871]372 $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
373
374 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
[2810]375}
376
377sub close_document {
378 my $self = shift(@_);
[8716]379 my $doc_obj = $self->{'doc_obj'};
[15871]380
381 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
382
[2810]383 # include any metadata passed in from previous plugins
384 # note that this metadata is associated with the top level section
[8716]385 $self->extra_metadata ($doc_obj,
386 $doc_obj->get_top_section(),
[2810]387 $self->{'metadata'});
388
389 # do any automatic metadata extraction
[8716]390 $self->auto_extract_metadata ($doc_obj);
[2810]391
392 # add an OID
[17026]393 $self->add_OID($doc_obj);
[2810]394
[7508]395 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8121]396 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
[5919]397
[2810]398 # process the document
[8716]399 $self->{'processor'}->process($doc_obj);
[2810]400
401 $self->{'num_processed'} ++;
[15871]402 undef $self->{'doc_obj'};
403 undef $doc_obj; # is this the same as above??
[2810]404}
405
4061;
407
[7900]408
409
410
Note: See TracBrowser for help on using the repository browser.