source: gsdl/trunk/perllib/plugins/ReadXMLFile.pm@ 16698

Last change on this file since 16698 was 16696, checked in by kjdon, 16 years ago

added an option to XML parser to strip out namespaces. did this so MARCXMLPlugin can work with namespaces. Not sure if it is the right thing to do or not. Currently we don't do anything with namespaces but presumably someone might want to.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1###########################################################################
2#
3# ReadXMLFile.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadXMLFile;
27
28use BasePlugin;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @ReadXMLFile::ISA = ('BasePlugin');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{ReadXMLFile.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "ReadXMLFile",
53 'desc' => "{ReadXMLFile.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64 push(@{$hashArgOptLists->{"OptList"}},$options);
65
66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
67
68 if ($self->{'info_only'}) {
69 # don't worry about creating the XML parser as all we want is the
70 # list of plugin options
71 return bless $self, $class;
72 }
73
74 my $parser = new XML::Parser('Style' => 'Stream',
75 'Pkg' => 'ReadXMLFile',
76 'PluginObj' => $self,
77 'Namespaces' => 1, # strip out namespaces
78 'Handlers' => {'Char' => \&Char,
79 'XMLDecl' => \&XMLDecl,
80 'Entity' => \&Entity,
81 'Doctype' => \&Doctype,
82 'Default' => \&Default,
83 });
84
85 $self->{'parser'} = $parser;
86
87 return bless $self, $class;
88}
89
90# the inheriting class must implement this method to tell whether to parse this doc type
91sub get_doctype {
92 my $self = shift(@_);
93 die "$self The inheriting class must implement get_doctype method";
94}
95
96
97sub apply_xslt
98{
99 my $self = shift @_;
100 my ($xslt,$filename) = @_;
101
102 my $outhandle = $self->{'outhandle'};
103
104 my $xslt_filename = $xslt;
105
106 if (! -e $xslt_filename) {
107 # Look in main site directory
108 my $gsdlhome = $ENV{'GSDLHOME'};
109 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
110 }
111
112 if (! -e $xslt_filename) {
113 # Look in collection directory
114 my $coldir = $ENV{'GSDLCOLLECTDIR'};
115 $xslt_filename = &util::filename_cat($coldir,$xslt);
116 }
117
118 if (! -e $xslt_filename) {
119 print $outhandle "Warning: Unable to find XSLT $xslt\n";
120 if (open(XMLIN,"<$filename")) {
121
122 my $untransformed_xml = "";
123 while (defined (my $line = <XMLIN>)) {
124
125 $untransformed_xml .= $line;
126 }
127 close(XMLIN);
128
129 return $untransformed_xml;
130 }
131 else {
132 print $outhandle "Error: Unable to open file $filename\n";
133 print $outhandle " $!\n";
134 return "";
135 }
136
137 }
138
139 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
140 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
141 my $xslt_base_cmd = "java -jar $jar_filename";
142 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
143
144 my $transformed_xml = "";
145
146 if (open(XSLT_IN,"$xslt_cmd |")) {
147 while (defined (my $line = <XSLT_IN>)) {
148
149 $transformed_xml .= $line;
150 }
151 close(XSLT_IN);
152 }
153 else {
154 print $outhandle "Error: Unable to run command $xslt_cmd\n";
155 print $outhandle " $!\n";
156 }
157
158 return $transformed_xml;
159
160}
161
162sub can_process_this_file {
163 my $self = shift(@_);
164 my ($filename) = @_;
165
166 if ($self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
167 return 1; # its a file for us
168 }
169 return 0;
170}
171
172sub check_doctype {
173 my $self = shift (@_);
174
175 my ($filename) = @_;
176
177 if (open(XMLIN,"<$filename")) {
178 my $doctype = $self->get_doctype();
179 ## check whether the doctype has the same name as the root element tag
180 while (defined (my $line = <XMLIN>)) {
181 ## find the root element
182 if ($line =~ /<([\w\d:]+)[\s>]/){
183 my $root = $1;
184 if ($root !~ $doctype){
185 close(XMLIN);
186 return 0;
187 }
188 else {
189 close(XMLIN);
190 return 1;
191 }
192 }
193 }
194 close(XMLIN);
195 }
196
197 return undef; # haven't found a valid line
198
199}
200
201
202# we need to implement read cos we are not just using process_exp to determine
203# whether to process this or not.
204sub read {
205 my $self = shift (@_);
206
207 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
208
209 # can we process this file??
210 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
211 return undef unless $self->can_process_this_file($filename_full_path);
212
213 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
214 $self->{'base_dir'} = $base_dir;
215 $self->{'file'} = $file;
216 $self->{'filename'} = $filename_full_path;
217 $self->{'processor'} = $processor;
218 $self->{'metadata'} = $metadata;
219
220 eval {
221 my $xslt = $self->{'xslt'};
222 if (defined $xslt && ($xslt ne "")) {
223 # perform xslt
224 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
225
226 # feed transformed file (now in memory as string) into XML parser
227 $self->{'parser'}->parse($transformed_xml);
228 }
229 else {
230 $self->{'parser'}->parsefile($filename_full_path);
231 }
232 };
233
234 if ($@) {
235
236 # parsefile may either croak somewhere in XML::Parser (e.g. because
237 # the document is not well formed) or die somewhere in ReadXMLFile or a
238 # derived plugin (e.g. because we're attempting to process a
239 # document whose DOCTYPE is not meant for this plugin). For the
240 # first case we'll print a warning and continue, for the second
241 # we'll just continue quietly
242
243 print STDERR "**** Error is: $@\n";
244
245 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
246 if (defined $msg) {
247 my $outhandle = $self->{'outhandle'};
248 my $plugin_name = ref ($self);
249 print $outhandle "$plugin_name failed to process $file ($msg)\n";
250 }
251
252 # reset ourself for the next document
253 $self->{'section_level'}=0;
254 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
255 return -1; # error during processing
256 }
257
258
259 return 1; # processed the file
260}
261
262
263sub get_default_process_exp {
264 my $self = shift (@_);
265
266 return q^(?i)\.xml$^;
267}
268
269sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
270sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
271sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
272sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
273sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
274sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
275sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
276sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
277sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
278sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
279
280# This Char function overrides the one in XML::Parser::Stream to overcome a
281# problem where $expat->{Text} is treated as the return value, slowing
282# things down significantly in some cases.
283sub Char {
284 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
285 $_[0]->{'Text'} .= $_[1];
286 return undef;
287}
288
289# Called at the beginning of the XML document.
290sub xml_start_document {
291 my $self = shift(@_);
292 my ($expat) = @_;
293
294 $self->open_document();
295}
296
297# Called for XML declarations
298sub xml_xmldecl {
299 my $self = shift(@_);
300 my ($expat, $version, $encoding, $standalone) = @_;
301}
302
303# Called for XML entities
304sub xml_entity {
305 my $self = shift(@_);
306 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
307}
308
309# Called for DOCTYPE declarations - use die to bail out if this doctype
310# is not meant for this plugin
311sub xml_doctype {
312 my $self = shift(@_);
313
314 my ($expat, $name, $sysid, $pubid, $internal) = @_;
315 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
316}
317
318
319# Called for every start tag. The $_ variable will contain a copy of the
320# tag and the %_ variable will contain the element's attributes.
321sub xml_start_tag {
322 my $self = shift(@_);
323 my ($expat, $element) = @_;
324}
325
326# Called for every end tag. The $_ variable will contain a copy of the tag.
327sub xml_end_tag {
328 my $self = shift(@_);
329 my ($expat, $element) = @_;
330}
331
332# Called just before start or end tags with accumulated non-markup text in
333# the $_ variable.
334sub xml_text {
335 my $self = shift(@_);
336 my ($expat) = @_;
337}
338
339# Called for processing instructions. The $_ variable will contain a copy
340# of the pi.
341sub xml_pi {
342 my $self = shift(@_);
343 my ($expat, $target, $data) = @_;
344}
345
346# Called at the end of the XML document.
347sub xml_end_document {
348 my $self = shift(@_);
349 my ($expat) = @_;
350
351 $self->close_document();
352}
353
354# Called for any characters not handled by the above functions.
355sub xml_default {
356 my $self = shift(@_);
357 my ($expat, $text) = @_;
358}
359
360sub open_document {
361 my $self = shift(@_);
362
363 # create a new document
364 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
365 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
366 $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
367
368 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
369}
370
371sub close_document {
372 my $self = shift(@_);
373 my $doc_obj = $self->{'doc_obj'};
374
375 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
376
377 # include any metadata passed in from previous plugins
378 # note that this metadata is associated with the top level section
379 $self->extra_metadata ($doc_obj,
380 $doc_obj->get_top_section(),
381 $self->{'metadata'});
382
383 # do any automatic metadata extraction
384 $self->auto_extract_metadata ($doc_obj);
385
386 # add an OID
387 $self->add_OID();
388
389 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
390 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
391
392 # process the document
393 $self->{'processor'}->process($doc_obj);
394
395 $self->{'num_processed'} ++;
396 undef $self->{'doc_obj'};
397 undef $doc_obj; # is this the same as above??
398}
399
4001;
401
402
403
404
Note: See TracBrowser for help on using the repository browser.