source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/ReadXMLFile.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 11.8 KB
Line 
1###########################################################################
2#
3# ReadXMLFile.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadXMLFile;
27
28use BasePlugin;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @ReadXMLFile::ISA = ('BasePlugin');
35
36 # ensure Greenstone's CPAN is on the path (but check first) [jmt12]
37 my $inc_paths = join(':', @INC);
38 my $gsdl_perllib_path = $ENV{'GSDLHOME'} . '/perllib/cpan';
39 if ($inc_paths !~ /$gsdl_perllib_path/)
40 {
41 unshift (@INC, $gsdl_perllib_path);
42 }
43}
44
45use XMLParser;
46
47my $arguments =
48 [ { 'name' => "process_exp",
49 'desc' => "{BasePlugin.process_exp}",
50 'type' => "regexp",
51 'deft' => &get_default_process_exp(),
52 'reqd' => "no" },
53 { 'name' => "xslt",
54 'desc' => "{ReadXMLFile.xslt}",
55 'type' => "string",
56 'deft' => "",
57 'reqd' => "no" } ];
58
59my $options = { 'name' => "ReadXMLFile",
60 'desc' => "{ReadXMLFile.desc}",
61 'abstract' => "yes",
62 'inherits' => "yes",
63 'args' => $arguments };
64
65sub new {
66 my ($class) = shift (@_);
67 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
68 push(@$pluginlist, $class);
69
70 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
71 push(@{$hashArgOptLists->{"OptList"}},$options);
72
73 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
74
75 if ($self->{'info_only'}) {
76 # don't worry about creating the XML parser as all we want is the
77 # list of plugin options
78 return bless $self, $class;
79 }
80
81 my $parser = new XML::Parser('Style' => 'Stream',
82 'Pkg' => 'ReadXMLFile',
83 'PluginObj' => $self,
84 'Handlers' => {'Char' => \&Char,
85 'XMLDecl' => \&XMLDecl,
86 'Entity' => \&Entity,
87 'Doctype' => \&Doctype,
88 'Default' => \&Default
89 });
90
91 $self->{'parser'} = $parser;
92
93 return bless $self, $class;
94}
95
96# the inheriting class must implement this method to tell whether to parse this doc type
97sub get_doctype {
98 my $self = shift(@_);
99 die "$self The inheriting class must implement get_doctype method";
100}
101
102
103sub apply_xslt
104{
105 my $self = shift @_;
106 my ($xslt,$filename) = @_;
107
108 my $outhandle = $self->{'outhandle'};
109
110 my $xslt_filename = $xslt;
111
112 if (! -e $xslt_filename) {
113 # Look in main site directory
114 my $gsdlhome = $ENV{'GSDLHOME'};
115 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
116 }
117
118 if (! -e $xslt_filename) {
119 # Look in collection directory
120 my $coldir = $ENV{'GSDLCOLLECTDIR'};
121 $xslt_filename = &util::filename_cat($coldir,$xslt);
122 }
123
124 if (! -e $xslt_filename) {
125 print $outhandle "Warning: Unable to find XSLT $xslt\n";
126 if (open(XMLIN,"<$filename")) {
127
128 my $untransformed_xml = "";
129 while (defined (my $line = <XMLIN>)) {
130
131 $untransformed_xml .= $line;
132 }
133 close(XMLIN);
134
135 return $untransformed_xml;
136 }
137 else {
138 print $outhandle "Error: Unable to open file $filename\n";
139 print $outhandle " $!\n";
140 return "";
141 }
142
143 }
144
145 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
146 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
147 my $xslt_base_cmd = "java -jar $jar_filename";
148 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
149
150 my $transformed_xml = "";
151
152 if (open(XSLT_IN,"$xslt_cmd |")) {
153 while (defined (my $line = <XSLT_IN>)) {
154
155 $transformed_xml .= $line;
156 }
157 close(XSLT_IN);
158 }
159 else {
160 print $outhandle "Error: Unable to run command $xslt_cmd\n";
161 print $outhandle " $!\n";
162 }
163
164 return $transformed_xml;
165
166}
167
168sub can_process_this_file {
169 my $self = shift(@_);
170 my ($filename) = @_;
171
172 if (-f $filename
173 && $self->SUPER::can_process_this_file($filename)
174 && $self->check_doctype($filename)) {
175 return 1; # its a file for us
176 }
177 return 0;
178}
179
180sub check_doctype {
181 my $self = shift (@_);
182
183 my ($filename) = @_;
184
185 if (open(XMLIN,"<$filename")) {
186 my $doctype = $self->get_doctype();
187 ## check whether the doctype has the same name as the root element tag
188 while (defined (my $line = <XMLIN>)) {
189 ## find the root element
190 if ($line =~ /<([\w\d:]+)[\s>]/){
191 my $root = $1;
192 if ($root !~ $doctype){
193 close(XMLIN);
194 return 0;
195 }
196 else {
197 close(XMLIN);
198 return 1;
199 }
200 }
201 }
202 close(XMLIN);
203 }
204
205 return undef; # haven't found a valid line
206
207}
208
209sub read {
210 my $self = shift (@_);
211
212 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
213
214 # can we process this file??
215 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
216 return undef unless $self->can_process_this_file($filename_full_path);
217
218 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
219 $self->{'base_dir'} = $base_dir;
220 $self->{'file'} = $file;
221 $self->{'filename'} = $filename_full_path;
222 $self->{'filename_no_path'} = $filename_no_path;
223 $self->{'processor'} = $processor;
224
225 # this contains metadata passed in from running metadata_read with other plugins (eg from MetadataXMLPlugin)
226 # we are also using it to store up any metadata found during parsing the XML, so that it can be added to the doc obj.
227 $self->{'metadata'} = $metadata;
228
229 if ($self->parse_file($filename_full_path)) {
230 return 1; # processed the file
231 }
232 return -1;
233}
234
235
236sub parse_file {
237 my $self = shift (@_);
238 my ($filename_full_path, $file, $gli) = @_;
239 eval {
240 my $xslt = $self->{'xslt'};
241 if (defined $xslt && ($xslt ne "")) {
242 # perform xslt
243 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
244
245 # feed transformed file (now in memory as string) into XML parser
246 $self->{'parser'}->parse($transformed_xml);
247 }
248 else {
249 $self->{'parser'}->parsefile($filename_full_path);
250 }
251 };
252
253 if ($@) {
254
255 # parsefile may either croak somewhere in XML::Parser (e.g. because
256 # the document is not well formed) or die somewhere in ReadXMLFile or a
257 # derived plugin (e.g. because we're attempting to process a
258 # document whose DOCTYPE is not meant for this plugin). For the
259 # first case we'll print a warning and continue, for the second
260 # we'll just continue quietly
261
262 print STDERR "**** Error is: $@\n";
263
264 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
265 if (defined $msg) {
266 my $outhandle = $self->{'outhandle'};
267 my $plugin_name = ref ($self);
268 print $outhandle "$plugin_name failed to process $file ($msg)\n";
269 }
270
271 # reset ourself for the next document
272 $self->{'section_level'}=0;
273 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
274 return -1; # error during processing
275 }
276 return 1; # parsing was successful
277}
278
279sub get_default_process_exp {
280 my $self = shift (@_);
281
282 return q^(?i)\.xml$^;
283}
284
285sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
286sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
287sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
288sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
289sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
290sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
291sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
292sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
293sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
294sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
295
296# This Char function overrides the one in XML::Parser::Stream to overcome a
297# problem where $expat->{Text} is treated as the return value, slowing
298# things down significantly in some cases.
299sub Char {
300 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
301 $_[0]->{'Text'} .= $_[1];
302 return undef;
303}
304
305
306# Called at the beginning of the XML document.
307sub xml_start_document {
308 my $self = shift(@_);
309 my ($expat) = @_;
310
311 $self->open_document();
312}
313
314# Called for XML declarations
315sub xml_xmldecl {
316 my $self = shift(@_);
317 my ($expat, $version, $encoding, $standalone) = @_;
318}
319
320# Called for XML entities
321sub xml_entity {
322 my $self = shift(@_);
323 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
324}
325
326# Called for DOCTYPE declarations - use die to bail out if this doctype
327# is not meant for this plugin
328sub xml_doctype {
329 my $self = shift(@_);
330
331 my ($expat, $name, $sysid, $pubid, $internal) = @_;
332 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
333}
334
335
336# Called for every start tag. The $_ variable will contain a copy of the
337# tag and the %_ variable will contain the element's attributes.
338sub xml_start_tag {
339 my $self = shift(@_);
340 my ($expat, $element) = @_;
341}
342
343# Called for every end tag. The $_ variable will contain a copy of the tag.
344sub xml_end_tag {
345 my $self = shift(@_);
346 my ($expat, $element) = @_;
347}
348
349# Called just before start or end tags with accumulated non-markup text in
350# the $_ variable.
351sub xml_text {
352 my $self = shift(@_);
353 my ($expat) = @_;
354}
355
356# Called for processing instructions. The $_ variable will contain a copy
357# of the pi.
358sub xml_pi {
359 my $self = shift(@_);
360 my ($expat, $target, $data) = @_;
361}
362
363# Called at the end of the XML document.
364sub xml_end_document {
365 my $self = shift(@_);
366 my ($expat) = @_;
367
368 $self->close_document();
369}
370
371# Called for any characters not handled by the above functions.
372sub xml_default {
373 my $self = shift(@_);
374 my ($expat, $text) = @_;
375}
376
377sub open_document {
378 my $self = shift(@_);
379
380 my $metadata = $self->{'metadata'};
381 my $filename_full_path = $self->{'filename'};
382
383 # create a new document
384 my $doc_obj = $self->{'doc_obj'} = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
385
386 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
387
388 my $filename_no_path = $self->{'filename_no_path'};
389 my $plugin_filename_encoding = $self->{'filename_encoding'};
390 my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding);
391
392 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
393
394 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
395}
396
397sub close_document {
398 my $self = shift(@_);
399 my $doc_obj = $self->{'doc_obj'};
400
401 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
402
403 # include any metadata passed in from previous plugins
404 # note that this metadata is associated with the top level section
405 $self->extra_metadata ($doc_obj,
406 $doc_obj->get_top_section(),
407 $self->{'metadata'});
408
409 # do any automatic metadata extraction
410 $self->auto_extract_metadata ($doc_obj);
411
412 # add an OID
413 $self->add_OID($doc_obj);
414
415 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
416 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
417
418 # process the document
419 $self->{'processor'}->process($doc_obj);
420
421 $self->{'num_processed'} ++;
422 undef $self->{'doc_obj'};
423 undef $doc_obj; # is this the same as above??
424}
425
4261;
427
428
429
430
Note: See TracBrowser for help on using the repository browser.