source: main/trunk/greenstone2/perllib/plugins/MetadataXMLPlugin.pm@ 21308

Last change on this file since 21308 was 20803, checked in by kjdon, 15 years ago

adding back in the code to store which file metadata came from. used later to add metadata files into reverse lookup database - otherwise we can't tell which files are new, which are existing

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# MetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlugin processes metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34# <FileSet>
35# <FileName>nugget.*</FileName>
36# <Description>
37# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39# </Description>
40# </FileSet>
41# <FileSet>
42# <FileName>nugget-point-1.jpg</FileName>
43# <Description>
44# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45# <Metadata name="Subject">Lighthouse</Metadata>
46# </Description>
47# </FileSet>
48# <FileSet>
49# <FileName>kaka-point-dir</FileName>
50# <Description>
51# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52# </Description>
53# </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget". For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element. In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values. If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead. Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurrence must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg. This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file. The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explicitly overridden later in the import.
86
87package MetadataXMLPlugin;
88
89use strict;
90no strict 'refs';
91use BasePlugin;
92use util;
93use metadatautil;
94
# Compile-time setup: declare BasePlugin as this package's parent class and
# prepend $GSDLHOME/perllib/cpan to the module search path.
sub BEGIN {
    @MetadataXMLPlugin::ISA = qw(BasePlugin);
    unshift @INC, $ENV{'GSDLHOME'} . "/perllib/cpan";
}
99
100use XMLParser;
101
# Arguments this plugin adds to the shared argument list.  The default for
# process_exp is whatever get_default_process_exp() returns.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
];

# Option block describing this plugin; new() pushes it onto the caller's
# "OptList".
my $options = {
    'name'     => 'MetadataXMLPlugin',
    'desc'     => '{MetadataXMLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# File-scoped plugin object.  It is assigned in new() and read by the parser
# callbacks StartTag, EndTag and Text, which only receive the expat object
# and the element name, not the plugin.
my $self;
118
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasePlugin constructor
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
#
# Returns the object built by BasePlugin, reblessed into $class.  Unless the
# object is flagged 'info_only' it also carries an XML::Parser in 'parser'
# (Stream style, with this package's Char and Doctype handlers) and an
# 'in_filename' flag initialised to 0.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # This deliberately assigns the file-scoped $self rather than a local:
    # the StartTag/EndTag/Text callbacks read their state from that variable.
    # Class->new is used instead of the indirect "new Class" form, which the
    # parser can misread when a sub called "new" exists in the current
    # package (as it does here).
    $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options or initialisations etc
        return bless $self, $class;
    }

    # create XML::Parser object for parsing metadata.xml files
    my %parser_args = (
        'Style'    => 'Stream',
        'Handlers' => { 'Char'    => \&Char,
                        'Doctype' => \&Doctype },
    );
    if ($] >= 5.008) {
        # Perl 5.8 and later additionally get an explicit protocol encoding;
        # on Perl 5.6 the option is left out, as before.
        $parser_args{'ProtocolEncoding'} = 'ISO-8859-1';
    }

    $self->{'parser'} = XML::Parser->new(%parser_args);
    $self->{'in_filename'} = 0;

    return bless $self, $class;
}
158
159
# Returns the default value for the process_exp argument: the source text of
# a pattern that matches any name ending in "metadata.xml".
sub get_default_process_exp
{
    my $default_exp = 'metadata\.xml$';
    return $default_exp;
}
164
165
# Notes a metadata.xml file in the block hash.
#
# Arguments (after the plugin object):
#   ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli)
# Only $base_dir, $file and $block_hash are used here.
#
# Returns undef when the combined path does not end in "metadata.xml" or is
# not a plain file.  Otherwise sets
# $block_hash->{'metadata_files'}->{<path>} to 1 and returns 1.
sub file_block_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $path = &util::filename_cat($base_dir, $file);

    my $is_metadata_file = ($path =~ /metadata\.xml$/) && (-f $path);
    return undef unless ($is_metadata_file);

    $block_hash->{'metadata_files'}->{$path} = 1;
    return 1;
}
179
# Parses one metadata.xml file and lets the parser callbacks record its
# contents in the structures supplied by the caller.
#
# Arguments (after the plugin object):
#   $pluginfo, $processor, $maxdocs - not used here
#   $base_dir, $file  - joined with util::filename_cat to locate the file
#   $block_hash       - the file is added under 'file_blocks'
#   $extrametakeys    - array ref, stored as 'metakeysref'
#   $extrametadata    - hash ref, stored as 'metadataref'
#   $extrametafile    - hash ref, stored as 'metafileref'
#   $gli              - if true, a <Processing> line is written to STDERR
#
# Returns undef if the path does not end in "metadata.xml" or is not a plain
# file, -1 if parsing raised an error, and 1 otherwise.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $maxdocs, $gli) = @_;

    my $filename = &util::filename_cat($base_dir, $file);
    if ($filename !~ /metadata\.xml$/ || !-f $filename) {
        return undef;
    }

    $self->{'metadata-file'} = $file;
    $self->{'metadata-filename'} = $filename;

    my $outhandle = $self->{'outhandle'};

    print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
    print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'} > 1);
    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
    $block_hash->{'file_blocks'}->{$filename} = 1;

    $self->{'metadataref'} = $extrametadata;
    $self->{'metafileref'} = $extrametafile;
    $self->{'metakeysref'} = $extrametakeys;

    # Start every file from a clean parse state.  If an earlier parse died
    # part-way through a <FileName> or <Metadata> element, the matching end
    # tag never ran, and a stale 'in_filename' flag would make the Text
    # callback record whitespace from this file as a target.
    $self->{'in_filename'} = 0;
    $self->{'metadata_name'} = "";
    $self->{'metadata_value'} = "";
    $self->{'metadata_accumulate'} = 0;
    $self->{'saved_targets'} = [];
    $self->{'saved_metadata'} = {};

    eval {
        $self->{'parser'}->parsefile($filename);
    };
    my $parse_error = $@;   # copy at once: $@ is global and easily clobbered

    if ($parse_error) {
        my $plugin_name = ref ($self);
        print $outhandle "$plugin_name failed to process $file ($parse_error)\n";

        return -1; #error
    }

    return 1;
}
221
# XML::Parser Doctype handler: rejects files whose document type is not the
# one this plugin handles.
#
# Arguments: ($expat, $name, $sysid, $pubid, $internal); only $name is used.
# Dies with a message naming the offending document type; metadata_read
# catches this and includes the message in its failure log line.
sub Doctype {
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
    # to be processed as well as the "DirectoryMetadata" files which should now
    # be created by import.pl
    if (!defined $name || $name !~ /^(Greenstone)?DirectoryMetadata$/) {
        my $shown = defined $name ? $name : '(none)';
        die "unexpected DOCTYPE '$shown' (expected DirectoryMetadata)";
    }
}
230
# Stream-style start-tag callback.  Arguments: ($expat, element name).
# The element's attributes are read from the global hash %_.
# State is kept on the file-scoped $self.
#
#   FileSet  - begin a new group: empty target list and empty metadata set
#   FileName - raise the flag telling Text that character data is a target
#   Metadata - remember the metadata name, clear its value, and note whether
#              mode="accumulate" was given (any other or no mode counts as 0)
sub StartTag {
    my ($expat, $tag) = @_;

    if ($tag eq "FileSet") {
        $self->{'saved_targets'} = [];
        $self->{'saved_metadata'} = {};
    }
    elsif ($tag eq "FileName") {
        $self->{'in_filename'} = 1;
    }
    elsif ($tag eq "Metadata") {
        my $mode = $_{'mode'};

        $self->{'metadata_name'} = $_{'name'};
        $self->{'metadata_value'} = "";
        $self->{'metadata_accumulate'} =
            (defined $mode && $mode eq "accumulate") ? 1 : 0;
    }
}
251
# Returns an independent copy of a metadata structure: hashes and arrays are
# copied recursively, anything else is returned as it is.
sub _clone_metadata {
    my ($data) = @_;

    my $type = ref($data);
    if ($type eq 'HASH') {
        return { map { $_ => _clone_metadata($data->{$_}) } keys %$data };
    }
    if ($type eq 'ARRAY') {
        return [ map { _clone_metadata($_) } @$data ];
    }
    return $data;
}

# Stream-style end-tag callback.  Arguments: ($expat, element name).
# State is kept on the file-scoped $self.
#
#   FileSet  - attach the metadata collected for this group to every target
#              named in it, and record which metadata.xml file supplied it
#   FileName - lower the flag raised by StartTag
#   Metadata - hand the finished name/value/accumulate triple to
#              metadatautil::store_saved_metadata, then clear the name
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "FileSet") {
        my $saved_metadata = $self->{'saved_metadata'};
        my $file = $self->{'metadata-file'};
        my $filename = $self->{'metadata-filename'};

        foreach my $target (@{$self->{'saved_targets'}}) {
            my $file_metadata = $self->{'metadataref'}->{$target};

            if (!defined $file_metadata) {
                # Give each target its own copy.  A FileSet may name several
                # targets; if they all held the same hash reference, a later
                # combine_metadata_structures call for one of them (which
                # works in place - its return value is not used) would
                # silently change the metadata of the others as well.
                $self->{'metadataref'}->{$target} = _clone_metadata($saved_metadata);

                # not had target before
                push (@{$self->{'metakeysref'}}, $target);
            }
            else {
                &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
            }

            # now record which metadata.xml file it came from
            if (!defined $self->{'metafileref'}->{$target}) {
                $self->{'metafileref'}->{$target} = {};
            }

            $self->{'metafileref'}->{$target}->{$file} = $filename;
        }
    }
    elsif ($element eq "FileName") {
        $self->{'in_filename'} = 0;
    }
    elsif ($element eq "Metadata") {
        # NOTE(review): store_saved_metadata presumably files the value under
        # $self->{'saved_metadata'} - confirm against metadatautil.
        &metadatautil::store_saved_metadata($self,$self->{'metadata_name'}, $self->{'metadata_value'}, $self->{'metadata_accumulate'});
        $self->{'metadata_name'} = "";
    }

}
292
# Stream-style text callback.  The character data is read from $_.
# State is kept on the file-scoped $self.
#
# Inside a FileName element the text is appended to the current list of
# targets.  Otherwise, if a metadata name is pending (defined and non-empty),
# the text becomes that metadata's value.  Any other text is ignored.
sub Text {
    my $content = $_;
    my $pending_name = $self->{'metadata_name'};

    if ($self->{'in_filename'}) {
        # FileName content
        push (@{$self->{'saved_targets'}}, $content);
    }
    elsif (defined ($pending_name) && $pending_name ne "") {
        # Metadata content
        $self->{'metadata_value'} = $content;
    }
}
304
# This Char function overrides the one in XML::Parser::Stream to overcome a
# problem where $expat->{Text} is treated as the return value, slowing
# things down significantly in some cases.
#
# Arguments: ($expat, $string).  Appends $string to $expat->{'Text'} and
# returns undef.
sub Char {
    if ($] < 5.008) {
        # Necessary to prevent encoding issues with XML::Parser 2.31+ and
        # Perl 5.6.  "use bytes" is lexically scoped, so the append has to
        # sit inside this block for the pragma to apply to it; on its own in
        # an otherwise empty block it affects nothing.
        use bytes;
        $_[0]->{'Text'} .= $_[1];
    }
    else {
        $_[0]->{'Text'} .= $_[1];
    }
    return undef;
}
315
316#sub combine_metadata_structures
317#{
318# my $self = shift(@_);
319#
320# my ($mdref1, $mdref2) = @_;
321# &metadatautil::combine_metadata_structures($mdref1, $mdref2);
322#}
323
324
3251;
Note: See TracBrowser for help on using the repository browser.