source: main/trunk/greenstone2/perllib/plugins/ReadXMLFile.pm@ 28836

Last change on this file since 28836 was 28836, checked in by ak19, 7 years ago

A question on the mailing list involved accented characters in custom metadata set names (not metadata set values). This exposed an issue in greenstone that could not cope with utf8 characters in metaset names. The cause was the sub Char { use bytes; ... lines when reading XML. These needed to be commented out in both MetadataXMLPlugin and ReadXMLFile (as GreenstoneXMLPlugin inherits from ReadXMLFile). Doing so showed that extra Encode::decode() operations to decode strings read in from XML into utf8 were no longer needed. As a result MetaXMLPlug and GreenstoneXMLPlug no longer call decode on the metadaname name and value read in from XML, or for the full-text, since GreenstoneXMLPlugin in entirety now no longer does the 'use bytes' part. Tested with text and html collections where metadata set nanes created in custom .mds files, their assigned metadata values and a document's full-text all contained the utf-8 specific character of a-macron.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1###########################################################################
2#
3# ReadXMLFile.pm -- base class for XML plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadXMLFile;
27
28use BasePlugin;
29use doc;
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33sub BEGIN {
34 @ReadXMLFile::ISA = ('BasePlugin');
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
36}
37
38use XMLParser;
39
40my $arguments =
41 [ { 'name' => "process_exp",
42 'desc' => "{BasePlugin.process_exp}",
43 'type' => "regexp",
44 'deft' => &get_default_process_exp(),
45 'reqd' => "no" },
46 { 'name' => "xslt",
47 'desc' => "{ReadXMLFile.xslt}",
48 'type' => "string",
49 'deft' => "",
50 'reqd' => "no" } ];
51
52my $options = { 'name' => "ReadXMLFile",
53 'desc' => "{ReadXMLFile.desc}",
54 'abstract' => "yes",
55 'inherits' => "yes",
56 'args' => $arguments };
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
64 push(@{$hashArgOptLists->{"OptList"}},$options);
65
66 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
67
68 if ($self->{'info_only'}) {
69 # don't worry about creating the XML parser as all we want is the
70 # list of plugin options
71 return bless $self, $class;
72 }
73
74 my $parser = new XML::Parser('Style' => 'Stream',
75 'Pkg' => 'ReadXMLFile',
76 'PluginObj' => $self,
77 'Handlers' => {'Char' => \&Char,
78 'XMLDecl' => \&XMLDecl,
79 'Entity' => \&Entity,
80 'Doctype' => \&Doctype,
81 'Default' => \&Default
82 });
83
84 $self->{'parser'} = $parser;
85
86 return bless $self, $class;
87}
88
89# the inheriting class must implement this method to tell whether to parse this doc type
90sub get_doctype {
91 my $self = shift(@_);
92 die "$self The inheriting class must implement get_doctype method";
93}
94
95
96sub apply_xslt
97{
98 my $self = shift @_;
99 my ($xslt,$filename) = @_;
100
101 my $outhandle = $self->{'outhandle'};
102
103 my $xslt_filename = $xslt;
104
105 if (! -e $xslt_filename) {
106 # Look in main site directory
107 my $gsdlhome = $ENV{'GSDLHOME'};
108 $xslt_filename = &util::filename_cat($gsdlhome,$xslt);
109 }
110
111 if (! -e $xslt_filename) {
112 # Look in collection directory
113 my $coldir = $ENV{'GSDLCOLLECTDIR'};
114 $xslt_filename = &util::filename_cat($coldir,$xslt);
115 }
116
117 if (! -e $xslt_filename) {
118 print $outhandle "Warning: Unable to find XSLT $xslt\n";
119 if (open(XMLIN,"<$filename")) {
120
121 my $untransformed_xml = "";
122 while (defined (my $line = <XMLIN>)) {
123
124 $untransformed_xml .= $line;
125 }
126 close(XMLIN);
127
128 return $untransformed_xml;
129 }
130 else {
131 print $outhandle "Error: Unable to open file $filename\n";
132 print $outhandle " $!\n";
133 return "";
134 }
135
136 }
137
138 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
139 my $jar_filename = &util::filename_cat($bin_java,"xalan.jar");
140 my $xslt_base_cmd = "java -jar $jar_filename";
141 my $xslt_cmd = "$xslt_base_cmd -IN \"$filename\" -XSL \"$xslt_filename\"";
142
143 my $transformed_xml = "";
144
145 if (open(XSLT_IN,"$xslt_cmd |")) {
146 while (defined (my $line = <XSLT_IN>)) {
147
148 $transformed_xml .= $line;
149 }
150 close(XSLT_IN);
151 }
152 else {
153 print $outhandle "Error: Unable to run command $xslt_cmd\n";
154 print $outhandle " $!\n";
155 }
156
157 return $transformed_xml;
158
159}
160
161sub can_process_this_file {
162 my $self = shift(@_);
163 my ($filename) = @_;
164
165 if (-f $filename
166 && $self->SUPER::can_process_this_file($filename)
167 && $self->check_doctype($filename)) {
168 return 1; # its a file for us
169 }
170 return 0;
171}
172
173sub check_doctype {
174 my $self = shift (@_);
175
176 my ($filename) = @_;
177
178 if (open(XMLIN,"<$filename")) {
179 my $doctype = $self->get_doctype();
180 ## check whether the doctype has the same name as the root element tag
181 while (defined (my $line = <XMLIN>)) {
182 ## find the root element
183 if ($line =~ /<([\w\d:]+)[\s>]/){
184 my $root = $1;
185 if ($root !~ $doctype){
186 close(XMLIN);
187 return 0;
188 }
189 else {
190 close(XMLIN);
191 return 1;
192 }
193 }
194 }
195 close(XMLIN);
196 }
197
198 return undef; # haven't found a valid line
199
200}
201
202sub read {
203 my $self = shift (@_);
204
205 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
206
207 # can we process this file??
208 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
209 return undef unless $self->can_process_this_file($filename_full_path);
210
211 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
212 $self->{'base_dir'} = $base_dir;
213 $self->{'file'} = $file;
214 $self->{'filename'} = $filename_full_path;
215 $self->{'filename_no_path'} = $filename_no_path;
216 $self->{'processor'} = $processor;
217
218 # this contains metadata passed in from running metadata_read with other plugins (eg from MetadataXMLPlugin)
219 # we are also using it to store up any metadata found during parsing the XML, so that it can be added to the doc obj.
220 $self->{'metadata'} = $metadata;
221
222 if ($self->parse_file($filename_full_path)) {
223 return 1; # processed the file
224 }
225 return -1;
226}
227
228
229sub parse_file {
230 my $self = shift (@_);
231 my ($filename_full_path, $file, $gli) = @_;
232 eval {
233 my $xslt = $self->{'xslt'};
234 if (defined $xslt && ($xslt ne "")) {
235 # perform xslt
236 my $transformed_xml = $self->apply_xslt($xslt,$filename_full_path);
237
238 # feed transformed file (now in memory as string) into XML parser
239 $self->{'parser'}->parse($transformed_xml);
240 }
241 else {
242 $self->{'parser'}->parsefile($filename_full_path);
243 }
244 };
245
246 if ($@) {
247
248 # parsefile may either croak somewhere in XML::Parser (e.g. because
249 # the document is not well formed) or die somewhere in ReadXMLFile or a
250 # derived plugin (e.g. because we're attempting to process a
251 # document whose DOCTYPE is not meant for this plugin). For the
252 # first case we'll print a warning and continue, for the second
253 # we'll just continue quietly
254
255 print STDERR "**** Error is: $@\n";
256
257 my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
258 if (defined $msg) {
259 my $outhandle = $self->{'outhandle'};
260 my $plugin_name = ref ($self);
261 print $outhandle "$plugin_name failed to process $file ($msg)\n";
262 }
263
264 # reset ourself for the next document
265 $self->{'section_level'}=0;
266 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
267 return -1; # error during processing
268 }
269 return 1; # parsing was successful
270}
271
272sub get_default_process_exp {
273 my $self = shift (@_);
274
275 return q^(?i)\.xml$^;
276}
277
278sub StartDocument {$_[0]->{'PluginObj'}->xml_start_document(@_);}
279sub XMLDecl {$_[0]->{'PluginObj'}->xml_xmldecl(@_);}
280sub Entity {$_[0]->{'PluginObj'}->xml_entity(@_);}
281sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
282sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
283sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
284sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
285sub PI {$_[0]->{'PluginObj'}->xml_pi(@_);}
286sub EndDocument {$_[0]->{'PluginObj'}->xml_end_document(@_);}
287sub Default {$_[0]->{'PluginObj'}->xml_default(@_);}
288
289# This Char function overrides the one in XML::Parser::Stream to overcome a
290# problem where $expat->{Text} is treated as the return value, slowing
291# things down significantly in some cases.
292sub Char {
293# use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
294 $_[0]->{'Text'} .= $_[1];
295 return undef;
296}
297
298
299# Called at the beginning of the XML document.
300sub xml_start_document {
301 my $self = shift(@_);
302 my ($expat) = @_;
303
304 $self->open_document();
305}
306
307# Called for XML declarations
308sub xml_xmldecl {
309 my $self = shift(@_);
310 my ($expat, $version, $encoding, $standalone) = @_;
311}
312
313# Called for XML entities
314sub xml_entity {
315 my $self = shift(@_);
316 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
317}
318
319# Called for DOCTYPE declarations - use die to bail out if this doctype
320# is not meant for this plugin
321sub xml_doctype {
322 my $self = shift(@_);
323
324 my ($expat, $name, $sysid, $pubid, $internal) = @_;
325 die "ReadXMLFile Cannot process XML document with DOCTYPE of $name";
326}
327
328
329# Called for every start tag. The $_ variable will contain a copy of the
330# tag and the %_ variable will contain the element's attributes.
331sub xml_start_tag {
332 my $self = shift(@_);
333 my ($expat, $element) = @_;
334}
335
336# Called for every end tag. The $_ variable will contain a copy of the tag.
337sub xml_end_tag {
338 my $self = shift(@_);
339 my ($expat, $element) = @_;
340}
341
342# Called just before start or end tags with accumulated non-markup text in
343# the $_ variable.
344sub xml_text {
345 my $self = shift(@_);
346 my ($expat) = @_;
347}
348
349# Called for processing instructions. The $_ variable will contain a copy
350# of the pi.
351sub xml_pi {
352 my $self = shift(@_);
353 my ($expat, $target, $data) = @_;
354}
355
356# Called at the end of the XML document.
357sub xml_end_document {
358 my $self = shift(@_);
359 my ($expat) = @_;
360
361 $self->close_document();
362}
363
364# Called for any characters not handled by the above functions.
365sub xml_default {
366 my $self = shift(@_);
367 my ($expat, $text) = @_;
368}
369
370sub open_document {
371 my $self = shift(@_);
372
373 my $metadata = $self->{'metadata'};
374 my $filename_full_path = $self->{'filename'};
375
376 # create a new document
377 my $doc_obj = $self->{'doc_obj'} = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
378
379 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
380
381 my $filename_no_path = $self->{'filename_no_path'};
382 my $plugin_filename_encoding = $self->{'filename_encoding'};
383 my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding);
384
385 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
386
387 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
388}
389
390sub close_document {
391 my $self = shift(@_);
392 my $doc_obj = $self->{'doc_obj'};
393
394 # do we want other auto stuff here, see BasePlugin.read_into_doc_obj
395
396 # include any metadata passed in from previous plugins
397 # note that this metadata is associated with the top level section
398 $self->extra_metadata ($doc_obj,
399 $doc_obj->get_top_section(),
400 $self->{'metadata'});
401
402 # do any automatic metadata extraction
403 $self->auto_extract_metadata ($doc_obj);
404
405 # add an OID
406 $self->add_OID($doc_obj);
407
408 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
409 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML");
410
411 # process the document
412 $self->{'processor'}->process($doc_obj);
413
414 $self->{'num_processed'} ++;
415 undef $self->{'doc_obj'};
416 undef $doc_obj; # is this the same as above??
417}
418
4191;
420
421
422
423
Note: See TracBrowser for help on using the repository browser.