source: gsdl/trunk/perllib/plugins/MetadataXMLPlug.pm@ 14955

Last change on this file since 14955 was 14955, checked in by mdewsnip, 16 years ago

Fixed MetadataXMLPlug.pm so empty values in metadata.xml files aren't ignored. It's sometimes important to be able to specify empty values (which are quite different from no value at all).

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# MetadataXMLPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlug processes metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34# <FileSet>
35# <FileName>nugget.*</FileName>
36# <Description>
37# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39# </Description>
40# </FileSet>
41# <FileSet>
42# <FileName>nugget-point-1.jpg</FileName>
43# <Description>
44# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45# <Metadata name="Subject">Lighthouse</Metadata>
46# </Description>
47# </FileSet>
48# <FileSet>
49# <FileName>kaka-point-dir</FileName>
50# <Description>
51# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52# </Description>
53# </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget". For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element. In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values. If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead. Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurrence must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg. This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file. The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explicitly overridden later in the import.
86
87package MetadataXMLPlug;
88
89use strict;
90no strict 'refs';
91use BasPlug;
92use util;
93use metadatautil;
94
# Compile-time setup: declare BasPlug as this package's parent class and
# put $GSDLHOME/perllib/cpan at the front of the module search path
# (presumably so later "use" lines pick up the bundled modules - confirm).
BEGIN {
    my $bundled_cpan_dir = $ENV{'GSDLHOME'} . "/perllib/cpan";

    @MetadataXMLPlug::ISA = ('BasPlug');
    unshift (@INC, $bundled_cpan_dir);
}
99
100use XMLParser;
101
# Arguments this plugin contributes to the shared argument list: only
# block_exp, whose default comes from get_default_block_exp().
my $arguments = [
    {
        name => "block_exp",
        desc => "{BasPlug.block_exp}",
        type => "regexp",
        reqd => "no",
        deft => get_default_block_exp(),
    },
];

# Option record describing this plugin; new() appends it to the "OptList"
# entry of the structure it is given.
my $options = {
    name     => "MetadataXMLPlug",
    desc     => "{MetadataXMLPlug.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};

# File-scoped handle on the plugin object. new() stores the object it
# builds here, and the parser callbacks StartTag, EndTag and Text (which
# do not receive the plugin as an argument) keep their state in it.
my $self;
117
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option records
#                      are appended to its "ArgList" and "OptList" entries
# Returns the object built by BasPlug, blessed into $class. Unless the
# object has 'info_only' set, it also gets an XML::Parser (Stream style)
# in 'parser' and the 'in_filename' flag initialised to 0.
sub new {
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push (@$pluginlist, $class);

    push (@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push (@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    # Deliberately assigns the file-scoped $self (no "my"): the parser
    # callbacks in this file read and write their state through it.
    $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, skip all initialisation.
    unless ($self->{'info_only'}) {
        # create XML::Parser object for parsing metadata.xml files
        my %parser_args = (
            'Style'    => 'Stream',
            'Handlers' => {
                'Char'    => \&Char,
                'Doctype' => \&Doctype,
            },
        );

        # Perl 5.8 and later additionally get an explicit protocol
        # encoding; Perl 5.6 is constructed without one.
        $parser_args{'ProtocolEncoding'} = 'ISO-8859-1' unless ($] < 5.008);

        $self->{'parser'} = XML::Parser->new(%parser_args);
        $self->{'in_filename'} = 0;
    }

    return bless $self, $class;
}
157
158
# Returns the default process expression, as regular-expression source
# text: any path ending in "metadata.xml".
sub get_default_process_exp
{
    my $process_exp = 'metadata\.xml$';
    return $process_exp;
}
163
# Returns the default block expression, as regular-expression source
# text: any path ending in "metadata.xml". We don't want any other
# plugins to see metadata.xml files. (Block expressions are currently
# only used in the read bit.)
sub get_default_block_exp
{
    my $block_exp = 'metadata\.xml$';
    return $block_exp;
}
170
# Parses one metadata.xml file and records what it describes.
#   $base_dir, $file - joined with util::filename_cat to locate the file
#   $extrametakeys   - array ref the parser callbacks append targets to
#   $extrametadata   - hash ref the parser callbacks store metadata in
#   $gli             - when true, a <Processing> marker is written to STDERR
# The remaining parameters ($pluginfo, $metadata, $processor, $maxdocs)
# are accepted but not used here.
# Returns undef when the path does not end in metadata.xml or is not a
# plain file, -1 when parsing fails (the error is written to the plugin's
# 'outhandle'), and 1 on success.
sub metadata_read
{
    my ($self, $pluginfo, $base_dir, $file, $metadata, $extrametakeys,
        $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    unless ($filename =~ /metadata\.xml$/ && -f $filename) {
        return undef;
    }

    if ($gli) {
        print STDERR "\n<Processing n='$file' p='MetadataXMLPlug'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print STDERR "MetadataXMLPlug: processing $file\n";
    }

    # Tell the parser callbacks where to put what they collect.
    $self->{'metadataref'} = $extrametadata;
    $self->{'metakeysref'} = $extrametakeys;

    eval {
        $self->{'parser'}->parsefile($filename);
    };
    # Capture the error straight away, before anything can overwrite $@.
    my $parse_error = $@;
    return 1 unless ($parse_error);

    my $outhandle = $self->{'outhandle'};
    my $plugin_name = ref ($self);
    print $outhandle "$plugin_name failed to process $file ($parse_error)\n";

    return -1; #error
}
202
# XML::Parser Doctype handler: rejects any document whose DOCTYPE name is
# not one this plugin understands.
#   $expat - the parser object (unused)
#   $name  - the document type name from the DOCTYPE declaration
#   $sysid, $pubid, $internal - accepted but not examined
# Dies with a message naming the offending DOCTYPE, so that a caller which
# traps the failure and logs it (as metadata_read does) tells the user what
# was wrong with the file rather than just "Died".
sub Doctype {
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
    # to be processed as well as the "DirectoryMetadata" files which should now
    # be created by import.pl
    return if (defined $name && $name =~ /^(Greenstone)?DirectoryMetadata$/);

    my $seen = defined $name ? $name : "(none)";
    die "MetadataXMLPlug: unexpected DOCTYPE '$seen' "
        . "(expected DirectoryMetadata or GreenstoneDirectoryMetadata)\n";
}
211
# Stream-style handler called for each opening tag. The tag's attributes
# are read from %_. State is kept in the file-scoped $self:
#   <Metadata> - remember the field name, reset the value to "" and note
#                whether mode="accumulate" was given
#   <FileName> - flag that the following text is a file name pattern
#   <FileSet>  - start with an empty target list and empty metadata
# Any other element is ignored.
sub StartTag {
    my ($expat, $tag) = @_;

    if ($tag eq "Metadata") {
        my $mode = $_{'mode'};

        $self->{'metadata_name'} = $_{'name'};
        # Start from an empty string so an element with no text still
        # ends up with a defined (empty) value.
        $self->{'metadata_value'} = "";
        $self->{'metadata_accumulate'} =
            (defined $mode && $mode eq "accumulate") ? 1 : 0;
    }
    elsif ($tag eq "FileName") {
        $self->{'in_filename'} = 1;
    }
    elsif ($tag eq "FileSet") {
        $self->{'saved_targets'} = [];
        $self->{'saved_metadata'} = {};
    }
}
232
# Stream-style handler called for each closing tag. State is kept in the
# file-scoped $self:
#   </FileSet>  - append every collected target to the key list, then give
#                 each target the FileSet's metadata: stored directly if the
#                 target has none yet, otherwise merged into what it has
#   </FileName> - stop treating text as a file name pattern
#   </Metadata> - store the finished name/value pair and clear the name
# Any other element is ignored.
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "FileSet") {
        push (@{$self->{'metakeysref'}}, @{$self->{'saved_targets'}});
        foreach my $target (@{$self->{'saved_targets'}}) {
            my $file_metadata = $self->{'metadataref'}->{$target};
            my $saved_metadata = $self->{'saved_metadata'};
            if (!defined $file_metadata) {
                # Give each target its own copy. The merge below changes a
                # target's existing metadata in place, so if every target
                # of this FileSet shared one hash (or one accumulate array),
                # a later FileSet naming just one of them would alter the
                # others as well.
                my %own_metadata;
                foreach my $mname (keys %$saved_metadata) {
                    my $mvalue = $saved_metadata->{$mname};
                    $own_metadata{$mname} =
                        (ref ($mvalue) eq "ARRAY") ? [@$mvalue] : $mvalue;
                }
                $self->{'metadataref'}->{$target} = \%own_metadata;
            }
            else {
                $self->combine_metadata_structures($file_metadata,$saved_metadata);
            }
        }
    }
    elsif ($element eq "FileName") {
        $self->{'in_filename'} = 0;
    }
    elsif ($element eq "Metadata") {
        $self->store_saved_metadata($self->{'metadata_name'}, $self->{'metadata_value'}, $self->{'metadata_accumulate'});
        # An empty name tells Text() that we are no longer inside Metadata.
        $self->{'metadata_name'} = "";
    }

}
258
# Stream-style handler called with the accumulated character data in $_.
# Inside a FileName element the text is added to the current target list;
# otherwise, while a Metadata element with a non-empty name is open, the
# text becomes that element's value. All other text is ignored.
sub Text {
    my $content = $_;
    my $open_metadata_name = $self->{'metadata_name'};

    if ($self->{'in_filename'}) {
        # $content == FileName content
        push (@{$self->{'saved_targets'}}, $content);
    }
    elsif (defined ($open_metadata_name) && $open_metadata_name ne "") {
        # $content == Metadata content
        $self->{'metadata_value'} = $content;
    }
}
270
# Character-data handler, installed in place of the one supplied by
# XML::Parser::Stream. It appends each chunk to the parser's 'Text'
# buffer and explicitly returns undef, which overcomes a problem where
# $expat->{Text} is treated as the return value, slowing things down
# significantly in some cases.
sub Char {
    my ($expat, $chunk) = @_;

    if ($] < 5.008) {
        use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
        # NOTE(review): "use bytes" is a lexically scoped, compile-time
        # pragma, so inside this block it does not appear to cover the
        # concatenation below - confirm before relying on it.
    }

    $expat->{'Text'} .= $chunk;
    return undef;
}
281
# Method wrapper that hands both metadata structures, in the same order,
# to metadatautil::combine_metadata_structures and passes back whatever
# that returns. EndTag calls it with a target's existing metadata first
# and the newly collected metadata second.
sub combine_metadata_structures
{
    my ($self, $existing_md, $incoming_md) = @_;

    return &metadatautil::combine_metadata_structures($existing_md, $incoming_md);
}
289
# Records one metadata value in $self->{'saved_metadata'}.
#   $mname         - metadata field name
#   $mvalue        - value to record (may be the empty string)
#   $md_accumulate - true: add to any existing value(s), keeping them in an
#                    array ref; false: replace whatever is there
sub store_saved_metadata
{
    my ($self, $mname, $mvalue, $md_accumulate) = @_;

    # Work through a reference to the slot so that the stored entry itself
    # is updated (and created on demand), not a copy of it.
    my $slot = \$self->{'saved_metadata'}->{$mname};

    if (!$md_accumulate) {
        # override mode - the new value replaces anything already stored
        $$slot = $mvalue;
    }
    elsif (!defined $$slot) {
        # accumulate mode - add value into (currently empty) array
        $$slot = [$mvalue];
    }
    elsif (ref ($$slot) eq "ARRAY") {
        # accumulate mode - add value to the existing values
        push (@{$$slot}, $mvalue);
    }
    else {
        # accumulate mode - the existing single value becomes a two-value array
        $$slot = [$$slot, $mvalue];
    }
}
318
319
3201;
Note: See TracBrowser for help on using the repository browser.