source: gsdl/trunk/perllib/plugins/ReadXMLFile.pm@ 17026

Last change on this file since 17026 was 17026, checked in by kjdon, 16 years ago

OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be donein other places any more. all plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3,OggVorbis...)

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1###########################################################################
2#
3# ReadXMLFile.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadXMLFile;
27
28use BasePlugin;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @ReadXMLFile::ISA = ('BasePlugin');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{ReadXMLFile.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "ReadXMLFile",
53 'desc' => "{ReadXMLFile.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64 push(@{$hashArgOptLists->{"OptList"}},$options);
65
66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
67
68 if ($self->{'info_only'}) {
69 # don't worry about creating the XML parser as all we want is the
70 # list of plugin options
71 return bless $self, $class;
72 }
73
74 my $parser = new XML::Parser('Style' => 'Stream',
75 'Pkg' => 'ReadXMLFile',
76 'PluginObj' => $self,
77 'Namespaces' => 1, # strip out namespaces
78 'Handlers' => {'Char' => \&Char,
79 'XMLDecl' => \&XMLDecl,
80 'Entity' => \&Entity,
81 'Doctype' => \&Doctype,
82 'Default' => \&Default
83 });
84
85 $self->{'parser'} = $parser;
86
87 return bless $self, $class;
88}
89
90# the inheriting class must implement this method to tell whether to parse this doc type
91sub get_doctype {
92 my $self = shift(@_);
93 die "$self The inheriting class must implement get_doctype method";
94}
95
96
97sub apply_xslt
98{
99 my $self = shift @_;
100 my ($xslt,$filename) = @_;
101
102 my $outhandle = $self->{'outhandle'};
103
104 my $xslt_filename = $xslt;
105
106 if (! -e $xslt_filename) {
107 # Look in main site directory
108 my $gsdlhome = $ENV{'GSDLHOME'};
109 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
110 }
111
112 if (! -e $xslt_filename) {
113 # Look in collection directory
114 my $coldir = $ENV{'GSDLCOLLECTDIR'};
115 $xslt_filename = &util::filename_cat($coldir,$xslt);
116 }
117
118 if (! -e $xslt_filename) {
119 print $outhandle "Warning: Unable to find XSLT $xslt\n";
120 if (open(XMLIN,"<$filename")) {
121
122 my $untransformed_xml = "";
123 while (defined (my $line = <XMLIN>)) {
124
125 $untransformed_xml .= $line;
126 }
127 close(XMLIN);
128
129 return $untransformed_xml;
130 }
131 else {
132 print $outhandle "Error: Unable to open file $filename\n";
133 print $outhandle " $!\n";
134 return "";
135 }
136
137 }
138
139 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
140 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
141 my $xslt_base_cmd = "java -jar $jar_filename";
142 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
143
144 my $transformed_xml = "";
145
146 if (open(XSLT_IN,"$xslt_cmd |")) {
147 while (defined (my $line = <XSLT_IN>)) {
148
149 $transformed_xml .= $line;
150 }
151 close(XSLT_IN);
152 }
153 else {
154 print $outhandle "Error: Unable to run command $xslt_cmd\n";
155 print $outhandle " $!\n";
156 }
157
158 return $transformed_xml;
159
160}
161
162sub can_process_this_file {
163 my $self = shift(@_);
164 my ($filename) = @_;
165
166 if ($self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
167 return 1; # its a file for us
168 }
169 return 0;
170}
171
172sub check_doctype {
173 my $self = shift (@_);
174
175 my ($filename) = @_;
176
177 if (open(XMLIN,"<$filename")) {
178 my $doctype = $self->get_doctype();
179 ## check whether the doctype has the same name as the root element tag
180 while (defined (my $line = <XMLIN>)) {
181 ## find the root element
182 if ($line =~ /<([\w\d:]+)[\s>]/){
183 my $root = $1;
184 if ($root !~ $doctype){
185 close(XMLIN);
186 return 0;
187 }
188 else {
189 close(XMLIN);
190 return 1;
191 }
192 }
193 }
194 close(XMLIN);
195 }
196
197 return undef; # haven't found a valid line
198
199}
200
201sub read {
202 my $self = shift (@_);
203
204 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
205
206 # can we process this file??
207 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
208 return undef unless $self->can_process_this_file($filename_full_path);
209
210 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
211 $self->{'base_dir'} = $base_dir;
212 $self->{'file'} = $file;
213 $self->{'filename'} = $filename_full_path;
214 $self->{'processor'} = $processor;
215 $self->{'metadata'} = $metadata;
216
217 eval {
218 my $xslt = $self->{'xslt'};
219 if (defined $xslt && ($xslt ne "")) {
220 # perform xslt
221 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
222
223 # feed transformed file (now in memory as string) into XML parser
224 $self->{'parser'}->parse($transformed_xml);
225 }
226 else {
227 $self->{'parser'}->parsefile($filename_full_path);
228 }
229 };
230
231 if ($@) {
232
233 # parsefile may either croak somewhere in XML::Parser (e.g. because
234 # the document is not well formed) or die somewhere in ReadXMLFile or a
235 # derived plugin (e.g. because we're attempting to process a
236 # document whose DOCTYPE is not meant for this plugin). For the
237 # first case we'll print a warning and continue, for the second
238 # we'll just continue quietly
239
240 print STDERR "**** Error is: $@\n";
241
242 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
243 if (defined $msg) {
244 my $outhandle = $self->{'outhandle'};
245 my $plugin_name = ref ($self);
246 print $outhandle "$plugin_name failed to process $file ($msg)\n";
247 }
248
249 # reset ourself for the next document
250 $self->{'section_level'}=0;
251 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
252 return -1; # error during processing
253 }
254
255
256 return 1; # processed the file
257}
258
259
260sub get_default_process_exp {
261 my $self = shift (@_);
262
263 return q^(?i)\.xml$^;
264}
265
266sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
267sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
268sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
269sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
270sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
271sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
272sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
273sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
274sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
275sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
276
277# This Char function overrides the one in XML::Parser::Stream to overcome a
278# problem where $expat->{Text} is treated as the return value, slowing
279# things down significantly in some cases.
280sub Char {
281 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
282 $_[0]->{'Text'} .= $_[1];
283 return undef;
284}
285
286# Called at the beginning of the XML document.
287sub xml_start_document {
288 my $self = shift(@_);
289 my ($expat) = @_;
290
291 $self->open_document();
292}
293
294# Called for XML declarations
295sub xml_xmldecl {
296 my $self = shift(@_);
297 my ($expat, $version, $encoding, $standalone) = @_;
298}
299
300# Called for XML entities
301sub xml_entity {
302 my $self = shift(@_);
303 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
304}
305
306# Called for DOCTYPE declarations - use die to bail out if this doctype
307# is not meant for this plugin
308sub xml_doctype {
309 my $self = shift(@_);
310
311 my ($expat, $name, $sysid, $pubid, $internal) = @_;
312 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
313}
314
315
316# Called for every start tag. The $_ variable will contain a copy of the
317# tag and the %_ variable will contain the element's attributes.
318sub xml_start_tag {
319 my $self = shift(@_);
320 my ($expat, $element) = @_;
321}
322
323# Called for every end tag. The $_ variable will contain a copy of the tag.
324sub xml_end_tag {
325 my $self = shift(@_);
326 my ($expat, $element) = @_;
327}
328
329# Called just before start or end tags with accumulated non-markup text in
330# the $_ variable.
331sub xml_text {
332 my $self = shift(@_);
333 my ($expat) = @_;
334}
335
336# Called for processing instructions. The $_ variable will contain a copy
337# of the pi.
338sub xml_pi {
339 my $self = shift(@_);
340 my ($expat, $target, $data) = @_;
341}
342
343# Called at the end of the XML document.
344sub xml_end_document {
345 my $self = shift(@_);
346 my ($expat) = @_;
347
348 $self->close_document();
349}
350
351# Called for any characters not handled by the above functions.
352sub xml_default {
353 my $self = shift(@_);
354 my ($expat, $text) = @_;
355}
356
357sub open_document {
358 my $self = shift(@_);
359
360 # create a new document
361 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
362 $self->{'doc_obj'}->add_utf8_metadata($self->{'doc_obj'}->get_top_section(), "Plugin", "$self->{'plugin_type'}");
363
364 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
365}
366
367sub close_document {
368 my $self = shift(@_);
369 my $doc_obj = $self->{'doc_obj'};
370
371 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
372
373 # include any metadata passed in from previous plugins
374 # note that this metadata is associated with the top level section
375 $self->extra_metadata ($doc_obj,
376 $doc_obj->get_top_section(),
377 $self->{'metadata'});
378
379 # do any automatic metadata extraction
380 $self->auto_extract_metadata ($doc_obj);
381
382 # add an OID
383 $self->add_OID($doc_obj);
384
385 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
386 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
387
388 # process the document
389 $self->{'processor'}->process($doc_obj);
390
391 $self->{'num_processed'} ++;
392 undef $self->{'doc_obj'};
393 undef $doc_obj; # is this the same as above??
394}
395
3961;
397
398
399
400
Note: See TracBrowser for help on using the repository browser.