source: main/trunk/greenstone2/perllib/plugins/MetadataXMLPlugin.pm@ 24060

Last change on this file since 24060 was 24060, checked in by ak19, 13 years ago

Dr Bainbridge fixed the unicode issue that appeared after the 2.84 release, when the parser instantiation got changed (by removing its ProtocolEncoding argument, which used to set this encoding to Latin-1).

  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
RevLine 
[13189]1###########################################################################
2#
[15872]3# MetadataXMLPlugin.pm --
[13189]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[15872]26# MetadataXMLPlugin process metadata.xml files in a collection
[13189]27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34# <FileSet>
35# <FileName>nugget.*</FileName>
36# <Description>
37# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39# </Description>
40# </FileSet>
41# <FileSet>
42# <FileName>nugget-point-1.jpg</FileName>
43# <Description>
44# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45# <Metadata name="Subject">Lighthouse</Metadata>
46# </Description>
47# </FileSet>
48# <FileSet>
49# <FileName>kaka-point-dir</FileName>
50# <Description>
51# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52# </Description>
53# </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget". For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element. In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values. If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead. Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurance must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg. This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file. The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explictly overridden later in the import.
86
[15872]87package MetadataXMLPlugin;
[13189]88
89use strict;
90no strict 'refs';
[22857]91
92use Encode;
93
[15872]94use BasePlugin;
[13189]95use util;
96use metadatautil;
97
98sub BEGIN {
[15872]99 @MetadataXMLPlugin::ISA = ('BasePlugin');
[13189]100 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
101}
102
103use XMLParser;
104
105my $arguments = [
[16386]106 { 'name' => "process_exp",
107 'desc' => "{BasePlugin.process_exp}",
[13189]108 'type' => "regexp",
109 'reqd' => "no",
[16386]110 'deft' => &get_default_process_exp() }
111
[13189]112];
113
[15872]114my $options = { 'name' => "MetadataXMLPlugin",
115 'desc' => "{MetadataXMLPlugin.desc}",
[13189]116 'abstract' => "no",
117 'inherits' => "yes",
118 'args' => $arguments };
119
120sub new {
121 my ($class) = shift (@_);
122 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
123 push(@$pluginlist, $class);
124
[15872]125 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
126 push(@{$hashArgOptLists->{"OptList"}},$options);
[13189]127
[21905]128 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[13189]129
130 if ($self->{'info_only'}) {
131 # don't worry about any options or initialisations etc
132 return bless $self, $class;
133 }
134
[23895]135 # The following used to be passed in as a parameter to XML::Parser,
136 # if the version of perl was greater than or equal to 5.8.
137 # The svn commit comment explaining the reason for adding this was
138 # not very clear and also said that it was quick fix and hadn't
139 # been tested under windows.
140 # More recent work has been to make strings in Perl "Unicode-aware"
141 # and so this line might actually be potentially harmful, however
142 # it is not the case that we encountered an actual error leading to
143 # its removal, rather it has been eliminated in an attempt to tighten
144 # up the code. For example, this protocol encoding is not used in
145 # ReadXMLFile.
146 # 'ProtocolEncoding' => 'ISO-8859-1',
147
[13189]148 # create XML::Parser object for parsing metadata.xml files
[24060]149 my $parser = new XML::Parser('Style' => 'Stream',
150 'Pkg' => 'MetadataXMLPlugin',
[21905]151 'PluginObj' => $self,
[24060]152 'Handlers' => {'Char' => \&Char,
[13822]153 'Doctype' => \&Doctype
154 });
155
[13189]156 $self->{'parser'} = $parser;
157 $self->{'in_filename'} = 0;
158
159
160 return bless $self, $class;
161}
162
163
164sub get_default_process_exp
165{
166 return q^metadata\.xml$^;
167}
168
[21916]169sub get_doctype {
170 my $self = shift(@_);
171
172 return "(Greenstone)?DirectoryMetadata"
173}
[13189]174
[21916]175sub can_process_this_file {
176 my $self = shift(@_);
177 my ($filename) = @_;
178
179 if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
180 return 1; # its a file for us
181 }
182 return 0;
183}
184
185sub check_doctype {
186 my $self = shift (@_);
187
188 my ($filename) = @_;
189
190 if (open(XMLIN,"<$filename")) {
191 my $doctype = $self->get_doctype();
192 ## check whether the doctype has the same name as the root element tag
193 while (defined (my $line = <XMLIN>)) {
194 ## find the root element
195 if ($line =~ /<([\w\d:]+)[\s>]/){
196 my $root = $1;
197 if ($root !~ $doctype){
198 close(XMLIN);
199 return 0;
200 }
201 else {
202 close(XMLIN);
203 return 1;
204 }
205 }
206 }
207 close(XMLIN);
208 }
209
210 return undef; # haven't found a valid line
211
212}
213
[20577]214sub file_block_read {
215 my $self = shift (@_);
216 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
[23419]217
[21916]218 my $filename_full_path = &util::filename_cat($base_dir, $file);
219 return undef unless $self->can_process_this_file($filename_full_path);
[20577]220
[23419]221 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
222
223 my $lower_drive = $filename_full_path;
224 $lower_drive =~ s/^([A-Z]):/\l$1:/i;
225
226 my $upper_drive = $filename_full_path;
227 $upper_drive =~ s/^([A-Z]):/\u$1:/i;
228
229 $block_hash->{'metadata_files'}->{$lower_drive} = 1;
230 $block_hash->{'metadata_files'}->{$upper_drive} = 1;
231 }
232 else {
233 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
234 }
[20577]235
236 return 1;
237}
238
[13189]239sub metadata_read
240{
241 my $self = shift (@_);
[19493]242 my ($pluginfo, $base_dir, $file, $block_hash,
243 $extrametakeys, $extrametadata,$extrametafile,
[23212]244 $processor, $gli, $aux) = @_;
[13189]245
246 my $filename = &util::filename_cat($base_dir, $file);
[21916]247 return undef unless $self->can_process_this_file($filename);
[24060]248
[19493]249 $self->{'metadata-file'} = $file;
250 $self->{'metadata-filename'} = $filename;
[23419]251
[16850]252 my $outhandle = $self->{'outhandle'};
253
[15872]254 print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
[16850]255 print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
[16386]256 # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
[23561]257 &util::block_filename($block_hash,$filename);
[20577]258
[13189]259 $self->{'metadataref'} = $extrametadata;
[19493]260 $self->{'metafileref'} = $extrametafile;
[13189]261 $self->{'metakeysref'} = $extrametakeys;
262
263 eval {
264 $self->{'parser'}->parsefile($filename);
265 };
266
267 if ($@) {
[24060]268 print STDERR "**** Error is: $@\n";
[13189]269 my $plugin_name = ref ($self);
[22853]270 my $failhandle = $self->{'failhandle'};
[13189]271 print $outhandle "$plugin_name failed to process $file ($@)\n";
[22853]272 print $failhandle "$plugin_name failed to process $file ($@)\n";
273 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
[13189]274 return -1; #error
275 }
[16386]276
[13189]277 return 1;
278
279}
280
[21905]281
282# Updated by Jeffrey 2010/04/16 @ DL Consulting Ltd.
283# Get rid off the global $self as it cause problems when there are 2+ MetadataXMLPlugin in your collect.cfg...
284# For example when you have an OAIMetadataXMLPlugin that is a child of MetadataXMLPlugin
285sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
286sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
287sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
288sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
289
290
291sub xml_doctype {
292 my $self = shift(@_);
[13189]293 my ($expat, $name, $sysid, $pubid, $internal) = @_;
294
295 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
296 # to be processed as well as the "DirectoryMetadata" files which should now
297 # be created by import.pl
298 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
299}
300
[21905]301sub xml_start_tag {
302 my $self = shift(@_);
[13189]303 my ($expat, $element) = @_;
[24060]304
[13189]305 if ($element eq "FileSet") {
306 $self->{'saved_targets'} = [];
307 $self->{'saved_metadata'} = {};
308 }
309 elsif ($element eq "FileName") {
310 $self->{'in_filename'} = 1;
311 }
312 elsif ($element eq "Metadata") {
313 $self->{'metadata_name'} = $_{'name'};
[14955]314 $self->{'metadata_value'} = "";
[13189]315 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
316 $self->{'metadata_accumulate'} = 1;
317 } else {
318 $self->{'metadata_accumulate'} = 0;
319 }
320 }
321}
322
[21905]323sub xml_end_tag {
324 my $self = shift(@_);
[13189]325 my ($expat, $element) = @_;
326
327 if ($element eq "FileSet") {
328 foreach my $target (@{$self->{'saved_targets'}}) {
329 my $file_metadata = $self->{'metadataref'}->{$target};
330 my $saved_metadata = $self->{'saved_metadata'};
[19493]331
[13189]332 if (!defined $file_metadata) {
333 $self->{'metadataref'}->{$target} = $saved_metadata;
[15004]334
335 # not had target before
336 push (@{$self->{'metakeysref'}}, $target);
[13189]337 }
338 else {
[15004]339 &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
[13189]340 }
[19493]341
342
343 # now record which metadata.xml file it came from
344
[20803]345 my $file = $self->{'metadata-file'};
346 my $filename = $self->{'metadata-filename'};
[19493]347
[20803]348 if (!defined $self->{'metafileref'}->{$target}) {
349 $self->{'metafileref'}->{$target} = {};
350 }
[19493]351
[20803]352 $self->{'metafileref'}->{$target}->{$file} = $filename
[13189]353 }
354 }
355 elsif ($element eq "FileName") {
356 $self->{'in_filename'} = 0;
357 }
358 elsif ($element eq "Metadata") {
[24060]359 # text read in by XML::Parser is in Perl's binary byte value
360 # form ... need to explicitly make it UTF-8
[22857]361
[24060]362 my $metadata_name = decode("utf-8",$self->{'metadata_name'});
363 my $metadata_value = decode("utf-8",$self->{'metadata_value'});
364
[22857]365 &metadatautil::store_saved_metadata($self,
366 $metadata_name, $metadata_value,
367 $self->{'metadata_accumulate'});
[13189]368 $self->{'metadata_name'} = "";
369 }
370
371}
372
[21905]373sub xml_text {
374 my $self = shift(@_);
[13189]375
376 if ($self->{'in_filename'}) {
377 # $_ == FileName content
378 push (@{$self->{'saved_targets'}}, $_);
379 }
380 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
381 # $_ == Metadata content
[14955]382 $self->{'metadata_value'} = $_;
[13189]383 }
384}
385
386# This Char function overrides the one in XML::Parser::Stream to overcome a
387# problem where $expat->{Text} is treated as the return value, slowing
388# things down significantly in some cases.
389sub Char {
[24060]390 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
391
392# if ($]<5.008) {
393# use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
394# }
[13189]395 $_[0]->{'Text'} .= $_[1];
396 return undef;
397}
398
399
400
4011;
Note: See TracBrowser for help on using the repository browser.