source: gsdl/trunk/perllib/plugins/MetadataXMLPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1###########################################################################
2#
3# MetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlugin processes metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34# <FileSet>
35# <FileName>nugget.*</FileName>
36# <Description>
37# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39# </Description>
40# </FileSet>
41# <FileSet>
42# <FileName>nugget-point-1.jpg</FileName>
43# <Description>
44# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45# <Metadata name="Subject">Lighthouse</Metadata>
46# </Description>
47# </FileSet>
48# <FileSet>
49# <FileName>kaka-point-dir</FileName>
50# <Description>
51# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52# </Description>
53# </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget". For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element. In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values. If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead. Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurrence must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg. This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file. The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explicitly overridden later in the import.
86
87package MetadataXMLPlugin;
88
89use strict;
90no strict 'refs';
91use BasePlugin;
92use util;
93use metadatautil;
94
95sub BEGIN {
96 @MetadataXMLPlugin::ISA = ('BasePlugin');
97 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
98}
99
100use XMLParser;
101
102my $arguments = [
103 { 'name' => "block_exp",
104 'desc' => "{BasePlugin.block_exp}",
105 'type' => "regexp",
106 'reqd' => "no",
107 'deft' => &get_default_block_exp() }
108];
109
110my $options = { 'name' => "MetadataXMLPlugin",
111 'desc' => "{MetadataXMLPlugin.desc}",
112 'abstract' => "no",
113 'inherits' => "yes",
114 'args' => $arguments };
115
116my ($self);
117
118sub new {
119 my ($class) = shift (@_);
120 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
121 push(@$pluginlist, $class);
122
123 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
124 push(@{$hashArgOptLists->{"OptList"}},$options);
125
126 $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
127
128 if ($self->{'info_only'}) {
129 # don't worry about any options or initialisations etc
130 return bless $self, $class;
131 }
132
133 # create XML::Parser object for parsing metadata.xml files
134 my $parser;
135 if ($]<5.008) {
136 # Perl 5.6
137 $parser = new XML::Parser('Style' => 'Stream',
138 'Handlers' => {'Char' => \&Char,
139 'Doctype' => \&Doctype
140 });
141 }
142 else {
143 # Perl 5.8
144 $parser = new XML::Parser('Style' => 'Stream',
145 'ProtocolEncoding' => 'ISO-8859-1',
146 'Handlers' => {'Char' => \&Char,
147 'Doctype' => \&Doctype
148 });
149 }
150
151 $self->{'parser'} = $parser;
152 $self->{'in_filename'} = 0;
153
154
155 return bless $self, $class;
156}
157
158
159sub get_default_process_exp
160{
161 return q^metadata\.xml$^;
162}
163
164# We don't want any other plugins to see metadata.xml files
165# block exp are currently only used in the read bit
166sub get_default_block_exp
167{
168 return q^metadata\.xml$^;
169}
170
171sub metadata_read
172{
173 my $self = shift (@_);
174 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata,
175$processor, $maxdocs, $gli) = @_;
176
177 my $filename = &util::filename_cat($base_dir, $file);
178 if ($filename !~ /metadata\.xml$/ || !-f $filename) {
179 return undef;
180 }
181
182 print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
183 print STDERR "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
184
185 $self->{'metadataref'} = $extrametadata;
186 $self->{'metakeysref'} = $extrametakeys;
187
188 eval {
189 $self->{'parser'}->parsefile($filename);
190 };
191
192 if ($@) {
193 my $outhandle = $self->{'outhandle'};
194 my $plugin_name = ref ($self);
195 print $outhandle "$plugin_name failed to process $file ($@)\n";
196
197 return -1; #error
198 }
199 return 1;
200
201}
202
203sub Doctype {
204 my ($expat, $name, $sysid, $pubid, $internal) = @_;
205
206 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
207 # to be processed as well as the "DirectoryMetadata" files which should now
208 # be created by import.pl
209 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
210}
211
212sub StartTag {
213 my ($expat, $element) = @_;
214
215 if ($element eq "FileSet") {
216 $self->{'saved_targets'} = [];
217 $self->{'saved_metadata'} = {};
218 }
219 elsif ($element eq "FileName") {
220 $self->{'in_filename'} = 1;
221 }
222 elsif ($element eq "Metadata") {
223 $self->{'metadata_name'} = $_{'name'};
224 $self->{'metadata_value'} = "";
225 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
226 $self->{'metadata_accumulate'} = 1;
227 } else {
228 $self->{'metadata_accumulate'} = 0;
229 }
230 }
231}
232
233sub EndTag {
234 my ($expat, $element) = @_;
235
236 if ($element eq "FileSet") {
237 foreach my $target (@{$self->{'saved_targets'}}) {
238 my $file_metadata = $self->{'metadataref'}->{$target};
239 my $saved_metadata = $self->{'saved_metadata'};
240 if (!defined $file_metadata) {
241 $self->{'metadataref'}->{$target} = $saved_metadata;
242
243 # not had target before
244 push (@{$self->{'metakeysref'}}, $target);
245 }
246 else {
247 &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
248 }
249 }
250 }
251 elsif ($element eq "FileName") {
252 $self->{'in_filename'} = 0;
253 }
254 elsif ($element eq "Metadata") {
255 &metadatautil::store_saved_metadata($self,$self->{'metadata_name'}, $self->{'metadata_value'}, $self->{'metadata_accumulate'});
256 $self->{'metadata_name'} = "";
257 }
258
259}
260
261sub Text {
262
263 if ($self->{'in_filename'}) {
264 # $_ == FileName content
265 push (@{$self->{'saved_targets'}}, $_);
266 }
267 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
268 # $_ == Metadata content
269 $self->{'metadata_value'} = $_;
270 }
271}
272
273# This Char function overrides the one in XML::Parser::Stream to overcome a
274# problem where $expat->{Text} is treated as the return value, slowing
275# things down significantly in some cases.
276sub Char {
277 if ($]<5.008) {
278 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
279 }
280 $_[0]->{'Text'} .= $_[1];
281 return undef;
282}
283
284#sub combine_metadata_structures
285#{
286# my $self = shift(@_);
287#
288# my ($mdref1, $mdref2) = @_;
289# &metadatautil::combine_metadata_structures($mdref1, $mdref2);
290#}
291
292
2931;
Note: See TracBrowser for help on using the repository browser.