source: gsdl/trunk/perllib/plugins/ReadXMLFile.pm@ 18528

Last change on this file since 18528 was 18528, checked in by davidb, 15 years ago

OIDmetadata wasn't supported in collect.cfg, but OIDtype was. Now rectified. Also introduced OIDcount as a file saved in the archives folder to help doc.pm use the correct value when working incrementally

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1###########################################################################
2#
3# ReadXMLFile.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadXMLFile;
27
28use BasePlugin;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @ReadXMLFile::ISA = ('BasePlugin');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{ReadXMLFile.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "ReadXMLFile",
53 'desc' => "{ReadXMLFile.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64 push(@{$hashArgOptLists->{"OptList"}},$options);
65
66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
67
68 if ($self->{'info_only'}) {
69 # don't worry about creating the XML parser as all we want is the
70 # list of plugin options
71 return bless $self, $class;
72 }
73
74 my $parser = new XML::Parser('Style' => 'Stream',
75 'Pkg' => 'ReadXMLFile',
76 'PluginObj' => $self,
77 'Namespaces' => 1, # strip out namespaces
78 'Handlers' => {'Char' => \&Char,
79 'XMLDecl' => \&XMLDecl,
80 'Entity' => \&Entity,
81 'Doctype' => \&Doctype,
82 'Default' => \&Default
83 });
84
85 $self->{'parser'} = $parser;
86
87 return bless $self, $class;
88}
89
90# the inheriting class must implement this method to tell whether to parse this doc type
91sub get_doctype {
92 my $self = shift(@_);
93 die "$self The inheriting class must implement get_doctype method";
94}
95
96
97sub apply_xslt
98{
99 my $self = shift @_;
100 my ($xslt,$filename) = @_;
101
102 my $outhandle = $self->{'outhandle'};
103
104 my $xslt_filename = $xslt;
105
106 if (! -e $xslt_filename) {
107 # Look in main site directory
108 my $gsdlhome = $ENV{'GSDLHOME'};
109 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
110 }
111
112 if (! -e $xslt_filename) {
113 # Look in collection directory
114 my $coldir = $ENV{'GSDLCOLLECTDIR'};
115 $xslt_filename = &util::filename_cat($coldir,$xslt);
116 }
117
118 if (! -e $xslt_filename) {
119 print $outhandle "Warning: Unable to find XSLT $xslt\n";
120 if (open(XMLIN,"<$filename")) {
121
122 my $untransformed_xml = "";
123 while (defined (my $line = <XMLIN>)) {
124
125 $untransformed_xml .= $line;
126 }
127 close(XMLIN);
128
129 return $untransformed_xml;
130 }
131 else {
132 print $outhandle "Error: Unable to open file $filename\n";
133 print $outhandle " $!\n";
134 return "";
135 }
136
137 }
138
139 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
140 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
141 my $xslt_base_cmd = "java -jar $jar_filename";
142 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
143
144 my $transformed_xml = "";
145
146 if (open(XSLT_IN,"$xslt_cmd |")) {
147 while (defined (my $line = <XSLT_IN>)) {
148
149 $transformed_xml .= $line;
150 }
151 close(XSLT_IN);
152 }
153 else {
154 print $outhandle "Error: Unable to run command $xslt_cmd\n";
155 print $outhandle " $!\n";
156 }
157
158 return $transformed_xml;
159
160}
161
162sub can_process_this_file {
163 my $self = shift(@_);
164 my ($filename) = @_;
165
166 if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
167 return 1; # its a file for us
168 }
169 return 0;
170}
171
172sub check_doctype {
173 my $self = shift (@_);
174
175 my ($filename) = @_;
176
177 if (open(XMLIN,"<$filename")) {
178 my $doctype = $self->get_doctype();
179 ## check whether the doctype has the same name as the root element tag
180 while (defined (my $line = <XMLIN>)) {
181 ## find the root element
182 if ($line =~ /<([\w\d:]+)[\s>]/){
183 my $root = $1;
184 if ($root !~ $doctype){
185 close(XMLIN);
186 return 0;
187 }
188 else {
189 close(XMLIN);
190 return 1;
191 }
192 }
193 }
194 close(XMLIN);
195 }
196
197 return undef; # haven't found a valid line
198
199}
200
201sub read {
202 my $self = shift (@_);
203
204 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
205
206 # can we process this file??
207 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
208 return undef unless $self->can_process_this_file($filename_full_path);
209
210 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
211 $self->{'base_dir'} = $base_dir;
212 $self->{'file'} = $file;
213 $self->{'filename'} = $filename_full_path;
214 $self->{'processor'} = $processor;
215 # this contains metadata passed in from running metadata_read with other plugins (eg from MetadataXMLPlugin)
216 # we are also using it to store up any metadata found during parsing the XML, so that it can be added to the doc obj.
217 $self->{'metadata'} = $metadata;
218
219 if ($self->parse_file($filename_full_path)) {
220 return 1; # processed the file
221 }
222 return -1;
223}
224
225
226sub parse_file {
227 my $self = shift (@_);
228 my ($filename_full_path, $file, $gli) = @_;
229 eval {
230 my $xslt = $self->{'xslt'};
231 if (defined $xslt && ($xslt ne "")) {
232 # perform xslt
233 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
234
235 # feed transformed file (now in memory as string) into XML parser
236 $self->{'parser'}->parse($transformed_xml);
237 }
238 else {
239 $self->{'parser'}->parsefile($filename_full_path);
240 }
241 };
242
243 if ($@) {
244
245 # parsefile may either croak somewhere in XML::Parser (e.g. because
246 # the document is not well formed) or die somewhere in ReadXMLFile or a
247 # derived plugin (e.g. because we're attempting to process a
248 # document whose DOCTYPE is not meant for this plugin). For the
249 # first case we'll print a warning and continue, for the second
250 # we'll just continue quietly
251
252 print STDERR "**** Error is: $@\n";
253
254 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
255 if (defined $msg) {
256 my $outhandle = $self->{'outhandle'};
257 my $plugin_name = ref ($self);
258 print $outhandle "$plugin_name failed to process $file ($msg)\n";
259 }
260
261 # reset ourself for the next document
262 $self->{'section_level'}=0;
263 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
264 return -1; # error during processing
265 }
266 return 1; # parsing was successful
267}
268
269sub get_default_process_exp {
270 my $self = shift (@_);
271
272 return q^(?i)\.xml$^;
273}
274
275sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
276sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
277sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
278sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
279sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
280sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
281sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
282sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
283sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
284sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
285
286# This Char function overrides the one in XML::Parser::Stream to overcome a
287# problem where $expat->{Text} is treated as the return value, slowing
288# things down significantly in some cases.
289sub Char {
290 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
291 $_[0]->{'Text'} .= $_[1];
292 return undef;
293}
294
295
296# Called at the beginning of the XML document.
297sub xml_start_document {
298 my $self = shift(@_);
299 my ($expat) = @_;
300
301 $self->open_document();
302}
303
304# Called for XML declarations
305sub xml_xmldecl {
306 my $self = shift(@_);
307 my ($expat, $version, $encoding, $standalone) = @_;
308}
309
310# Called for XML entities
311sub xml_entity {
312 my $self = shift(@_);
313 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
314}
315
316# Called for DOCTYPE declarations - use die to bail out if this doctype
317# is not meant for this plugin
318sub xml_doctype {
319 my $self = shift(@_);
320
321 my ($expat, $name, $sysid, $pubid, $internal) = @_;
322 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
323}
324
325
326# Called for every start tag. The $_ variable will contain a copy of the
327# tag and the %_ variable will contain the element's attributes.
328sub xml_start_tag {
329 my $self = shift(@_);
330 my ($expat, $element) = @_;
331}
332
333# Called for every end tag. The $_ variable will contain a copy of the tag.
334sub xml_end_tag {
335 my $self = shift(@_);
336 my ($expat, $element) = @_;
337}
338
339# Called just before start or end tags with accumulated non-markup text in
340# the $_ variable.
341sub xml_text {
342 my $self = shift(@_);
343 my ($expat) = @_;
344}
345
346# Called for processing instructions. The $_ variable will contain a copy
347# of the pi.
348sub xml_pi {
349 my $self = shift(@_);
350 my ($expat, $target, $data) = @_;
351}
352
353# Called at the end of the XML document.
354sub xml_end_document {
355 my $self = shift(@_);
356 my ($expat) = @_;
357
358 $self->close_document();
359}
360
361# Called for any characters not handled by the above functions.
362sub xml_default {
363 my $self = shift(@_);
364 my ($expat, $text) = @_;
365}
366
367sub open_document {
368 my $self = shift(@_);
369
370 # create a new document
371 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
372 $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
373
374 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
375}
376
377sub close_document {
378 my $self = shift(@_);
379 my $doc_obj = $self->{'doc_obj'};
380
381 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
382
383 # include any metadata passed in from previous plugins
384 # note that this metadata is associated with the top level section
385 $self->extra_metadata ($doc_obj,
386 $doc_obj->get_top_section(),
387 $self->{'metadata'});
388
389 # do any automatic metadata extraction
390 $self->auto_extract_metadata ($doc_obj);
391
392 # add an OID
393 $self->add_OID($doc_obj);
394
395 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
396 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
397
398 # process the document
399 $self->{'processor'}->process($doc_obj);
400
401 $self->{'num_processed'} ++;
402 undef $self->{'doc_obj'};
403 undef $doc_obj; # is this the same as above??
404}
405
4061;
407
408
409
410
Note: See TracBrowser for help on using the repository browser.