source: trunk/gsdl/perllib/plugins/XMLPlug.pm@ 13226

Last change on this file since 13226 was 13226, checked in by shaoqun, 17 years ago

should allow element names with digits

  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
RevLine 
[2810]1###########################################################################
2#
3# XMLPlug.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package XMLPlug;
27
28use BasPlug;
29use doc;
[10254]30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
[2810]32
33sub BEGIN {
[8716]34 @XMLPlug::ISA = ('BasPlug');
[2810]35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
[8069]38use XMLParser;
[2810]39
[4744]40my $arguments =
41 [ { 'name' => "process_exp",
[4873]42 'desc' => "{BasPlug.process_exp}",
[6408]43 'type' => "regexp",
[4744]44 'deft' => &get_default_process_exp(),
[9957]45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{XMLPlug.xslt}",
48 'type' => "string",
49 'deft' => "",
[4744]50 'reqd' => "no" } ];
51
[3540]52my $options = { 'name' => "XMLPlug",
[5680]53 'desc' => "{XMLPlug.desc}",
[7244]54 'abstract' => "yes",
[4744]55 'inherits' => "yes",
56 'args' => $arguments };
[3540]57
58
[10170]59our ($self);
[2810]60sub new {
[10218]61 my ($class) = shift (@_);
62 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
63 push(@$pluginlist, $class);
[2810]64
[10218]65 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
66 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
67
[2810]68 # $self is global for use within subroutines called by XML::Parser
[12169]69 $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
[9957]70
[11090]71 if ($self->{'info_only'}) {
72 # don't worry about any options etc
73 return bless $self, $class;
74 }
75
[2810]76 my $parser = new XML::Parser('Style' => 'Stream',
77 'Handlers' => {'Char' => \&Char,
78 'XMLDecl' => \&XMLDecl,
[2890]79 'Entity' => \&Entity,
[2810]80 'Doctype' => \&Doctype,
[13148]81 'Default' => \&Default,
82 });
83
[2810]84 $self->{'parser'} = $parser;
85
86 return bless $self, $class;
87}
88
[13192]89# the inheriting class must implement this method to tell whether to parse this doc type
90sub get_doctype {
91 my $self = shift(@_);
[13221]92 die "$self The inheriting class must implement get_doctype method";
[13192]93}
94
95
[9957]96sub apply_xslt
97{
98 my $self = shift @_;
99 my ($xslt,$filename) = @_;
100
101 my $outhandle = $self->{'outhandle'};
[2810]102
[9957]103 my $xslt_filename = $xslt;
104
105 if (! -e $xslt_filename) {
106 # Look in main site directory
107 my $gsdlhome = $ENV{'GSDLHOME'};
108 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
109 }
110
111 if (! -e $xslt_filename) {
112 # Look in collection directory
[11661]113 my $coldir = $ENV{'GSDLCOLLECTDIR'};
[9957]114 $xslt_filename = &util::filename_cat($coldir,$xslt);
115 }
116
117 if (! -e $xslt_filename) {
118 print $outhandle "Warning: Unable to find XSLT $xslt\n";
119 if (open(XMLIN,"<$filename")) {
120
[10254]121 my $untransformed_xml = "";
[9957]122 while (defined (my $line = <XMLIN>)) {
123
124 $untransformed_xml .= $line;
125 }
126 close(XMLIN);
127
128 return $untransformed_xml;
129 }
130 else {
131 print $outhandle "Error: Unable to open file $filename\n";
132 print $outhandle " $!\n";
133 return "";
134 }
135
136 }
137
138 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
139 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
140 my $xslt_base_cmd = "java -jar $jar_filename";
141 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
142
143 my $transformed_xml = "";
144
145 if (open(XSLT_IN,"$xslt_cmd |")) {
146 while (defined (my $line = <XSLT_IN>)) {
147
148 $transformed_xml .= $line;
149 }
150 close(XSLT_IN);
151 }
152 else {
[10254]153 print $outhandle "Error: Unable to run command $xslt_cmd\n";
[9957]154 print $outhandle " $!\n";
155 }
156
157 return $transformed_xml;
158
159}
160
[13192]161sub check_doctype {
162 $self = shift (@_);
163
164 my ($filename) = @_;
165
166 if (open(XMLIN,"<$filename")) {
167 my $doctype = $self->get_doctype();
168 ## check whether the doctype has the same name as the root element tag
169 while (defined (my $line = <XMLIN>)) {
170 ## find the root element
[13226]171 if ($line =~ /<([\w\d:]+)[\s>]/){
[13192]172 my $root = $1;
173 if ($root !~ $doctype){
174 close(XMLIN);
175 return 0;
176 }
177 else {
178 close(XMLIN);
179 return 1;
180 }
181 }
182 }
183 close(XMLIN);
184 }
185
186 return undef; # haven't found a valid line
187
188}
[9957]189
[13192]190# because we are not just using process_exp to determine whether to process or not, we need to implement this too, so that a file can be passed down if we are not actually processing it
191sub metadata_read {
192 $self = shift (@_);
193
194 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
195
196 my $result = $self->SUPER::metadata_read($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli);
197
198 if (defined $result) {
199 # we think we are processing this, but check that we actually are
200 my $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
201
202 if ($self->check_doctype($filename)) {
203 return $result;
204 }
205 }
206 return undef;
207}
208
[2810]209sub read {
[9957]210 # $self must be global to work with XML callback routines.
[2896]211 $self = shift (@_);
[2810]212
[9853]213 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[2810]214
[11090]215 # Make sure we're processing the correct file, do blocking etc
216 my ($block_status,$filename) = $self->read_block(@_);
217 return $block_status if ((!defined $block_status) || ($block_status==0));
[2810]218
[13192]219 ## check the doctype to see whether we really want to process the file
220 if (!$self->check_doctype($filename)) {
221 # this file is not for us
222 return undef;
223 }
224
[2810]225 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[12844]226 $self->{'base_dir'} = $base_dir;
[2810]227 $self->{'file'} = $file;
228 $self->{'filename'} = $filename;
229 $self->{'processor'} = $processor;
230 $self->{'metadata'} = $metadata;
[9957]231
[2810]232 eval {
[9957]233 my $xslt = $self->{'xslt'};
234 if (defined $xslt && ($xslt ne "")) {
235 # perform xslt
236 my $transformed_xml = $self->apply_xslt($xslt,$filename);
237
238 # feed transformed file (now in memory as string) into XML parser
239 $self->{'parser'}->parse($transformed_xml);
240 }
241 else {
242 $self->{'parser'}->parsefile($filename);
243 }
[2810]244 };
[7900]245
[2810]246 if ($@) {
247
248 # parsefile may either croak somewhere in XML::Parser (e.g. because
249 # the document is not well formed) or die somewhere in XMLPlug or a
250 # derived plugin (e.g. because we're attempting to process a
251 # document whose DOCTYPE is not meant for this plugin). For the
252 # first case we'll print a warning and continue, for the second
253 # we'll just continue quietly
254
[10170]255 print STDERR "**** Error is: $@\n";
[7900]256
[2810]257 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
258 if (defined $msg) {
259 my $outhandle = $self->{'outhandle'};
260 my $plugin_name = ref ($self);
261 print $outhandle "$plugin_name failed to process $file ($msg)\n";
262 }
[7900]263
[3107]264 # reset ourself for the next document
265 $self->{'section_level'}=0;
[9584]266 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[7362]267 return -1; # error during processing
[2810]268 }
[9957]269
[13221]270
[2810]271 return 1; # processed the file
272}
273
[10170]274# the following two methods are for if you want to do the parsing from a
275# plugin that inherits from this. it seems that you can't call the parse
276# methods directly. WHY???
277sub parse_file {
278 $self = shift (@_);
279 my ($filename) = @_;
280 $self->{'parser'}->parsefile($filename);
281}
282
283sub parse_string {
284 $self = shift (@_);
285 my ($xml_string) = @_;
286 $self->{'parser'}->parse($xml_string);
287}
288
[2810]289sub get_default_process_exp {
290 my $self = shift (@_);
291
292 return q^(?i)\.xml$^;
293}
294
295sub StartDocument {$self->xml_start_document(@_);}
296sub XMLDecl {$self->xml_xmldecl(@_);}
[2890]297sub Entity {$self->xml_entity(@_);}
[2810]298sub Doctype {$self->xml_doctype(@_);}
299sub StartTag {$self->xml_start_tag(@_);}
300sub EndTag {$self->xml_end_tag(@_);}
301sub Text {$self->xml_text(@_);}
302sub PI {$self->xml_pi(@_);}
303sub EndDocument {$self->xml_end_document(@_);}
304sub Default {$self->xml_default(@_);}
305
306# This Char function overrides the one in XML::Parser::Stream to overcome a
307# problem where $expat->{Text} is treated as the return value, slowing
308# things down significantly in some cases.
309sub Char {
[9462]310 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
311 $_[0]->{'Text'} .= $_[1];
312 return undef;
[2810]313}
314
315# Called at the beginning of the XML document.
316sub xml_start_document {
317 my $self = shift(@_);
318 my ($expat) = @_;
319
320 $self->open_document();
321}
322
323# Called for XML declarations
324sub xml_xmldecl {
325 my $self = shift(@_);
326 my ($expat, $version, $encoding, $standalone) = @_;
327}
328
[2890]329# Called for XML entities
330sub xml_entity {
331 my $self = shift(@_);
332 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
333}
334
[2810]335# Called for DOCTYPE declarations - use die to bail out if this doctype
336# is not meant for this plugin
337sub xml_doctype {
338 my $self = shift(@_);
[13148]339
[2810]340 my ($expat, $name, $sysid, $pubid, $internal) = @_;
341 die "XMLPlug Cannot process XML document with DOCTYPE of $name";
342}
343
[13148]344
[2810]345# Called for every start tag. The $_ variable will contain a copy of the
346# tag and the %_ variable will contain the element's attributes.
347sub xml_start_tag {
348 my $self = shift(@_);
349 my ($expat, $element) = @_;
350}
351
352# Called for every end tag. The $_ variable will contain a copy of the tag.
353sub xml_end_tag {
354 my $self = shift(@_);
355 my ($expat, $element) = @_;
356}
357
358# Called just before start or end tags with accumulated non-markup text in
359# the $_ variable.
360sub xml_text {
361 my $self = shift(@_);
362 my ($expat) = @_;
363}
364
365# Called for processing instructions. The $_ variable will contain a copy
366# of the pi.
367sub xml_pi {
368 my $self = shift(@_);
369 my ($expat, $target, $data) = @_;
370}
371
372# Called at the end of the XML document.
373sub xml_end_document {
374 my $self = shift(@_);
375 my ($expat) = @_;
376
377 $self->close_document();
378}
379
380# Called for any characters not handled by the above functions.
381sub xml_default {
382 my $self = shift(@_);
383 my ($expat, $text) = @_;
384}
385
386sub open_document {
387 my $self = shift(@_);
388
389 # create a new document
390 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
[12270]391 $self->{'doc_obj'}->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});
[2810]392}
393
394sub close_document {
395 my $self = shift(@_);
[8716]396 my $doc_obj = $self->{'doc_obj'};
[2810]397 # include any metadata passed in from previous plugins
398 # note that this metadata is associated with the top level section
[8716]399 $self->extra_metadata ($doc_obj,
400 $doc_obj->get_top_section(),
[2810]401 $self->{'metadata'});
402
403 # do any automatic metadata extraction
[8716]404 $self->auto_extract_metadata ($doc_obj);
[2810]405
406 # add an OID
[8716]407 $doc_obj->set_OID();
[2810]408
[7508]409 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8121]410 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
[5919]411
[2810]412 # process the document
[8716]413 $self->{'processor'}->process($doc_obj);
[2810]414
415 $self->{'num_processed'} ++;
416}
417
4181;
419
[7900]420
421
422
Note: See TracBrowser for help on using the repository browser.