source: main/trunk/greenstone2/perllib/plugins/MetadataXMLPlugin.pm@ 23419

Last change on this file since 23419 was 23419, checked in by max, 13 years ago

Setting the values to store as block files is now done through an API call to BasePlugin. This way, anything uniform requirement (such as putting in both C:\... and c:\... entries for Windows) can be done in one place.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1###########################################################################
2#
3# MetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlugin process metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34# <FileSet>
35# <FileName>nugget.*</FileName>
36# <Description>
37# <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38# <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39# </Description>
40# </FileSet>
41# <FileSet>
42# <FileName>nugget-point-1.jpg</FileName>
43# <Description>
44# <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45# <Metadata name="Subject">Lighthouse</Metadata>
46# </Description>
47# </FileSet>
48# <FileSet>
49# <FileName>kaka-point-dir</FileName>
50# <Description>
51# <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52# </Description>
53# </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget". For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element. In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values. If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead. Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurance must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg. This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file. The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explictly overridden later in the import.
86
87package MetadataXMLPlugin;
88
89use strict;
90no strict 'refs';
91
92use Encode;
93
94use BasePlugin;
95use util;
96use metadatautil;
97
98sub BEGIN {
99 @MetadataXMLPlugin::ISA = ('BasePlugin');
100 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
101}
102
103use XMLParser;
104
105my $arguments = [
106 { 'name' => "process_exp",
107 'desc' => "{BasePlugin.process_exp}",
108 'type' => "regexp",
109 'reqd' => "no",
110 'deft' => &get_default_process_exp() }
111
112];
113
114my $options = { 'name' => "MetadataXMLPlugin",
115 'desc' => "{MetadataXMLPlugin.desc}",
116 'abstract' => "no",
117 'inherits' => "yes",
118 'args' => $arguments };
119
120sub new {
121 my ($class) = shift (@_);
122 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
123 push(@$pluginlist, $class);
124
125 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
126 push(@{$hashArgOptLists->{"OptList"}},$options);
127
128 my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
129
130 if ($self->{'info_only'}) {
131 # don't worry about any options or initialisations etc
132 return bless $self, $class;
133 }
134
135 # create XML::Parser object for parsing metadata.xml files
136 my $parser;
137 if ($]<5.008) {
138 # Perl 5.6
139 $parser = new XML::Parser('Style' => 'Stream',
140 'PluginObj' => $self,
141 'Handlers' => {'Char' => \&Char,
142 'Doctype' => \&Doctype
143 });
144 }
145 else {
146 # Perl 5.8 or greater
147 $parser = new XML::Parser('Style' => 'Stream',
148 'PluginObj' => $self,
149 'ProtocolEncoding' => 'ISO-8859-1',
150 'Handlers' => {'Char' => \&Char,
151 'Doctype' => \&Doctype
152 });
153 }
154
155 $self->{'parser'} = $parser;
156 $self->{'in_filename'} = 0;
157
158
159 return bless $self, $class;
160}
161
162
163sub get_default_process_exp
164{
165 return q^metadata\.xml$^;
166}
167
168sub get_doctype {
169 my $self = shift(@_);
170
171 return "(Greenstone)?DirectoryMetadata"
172}
173
174sub can_process_this_file {
175 my $self = shift(@_);
176 my ($filename) = @_;
177
178 if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
179 return 1; # its a file for us
180 }
181 return 0;
182}
183
184sub check_doctype {
185 my $self = shift (@_);
186
187 my ($filename) = @_;
188
189 if (open(XMLIN,"<$filename")) {
190 my $doctype = $self->get_doctype();
191 ## check whether the doctype has the same name as the root element tag
192 while (defined (my $line = <XMLIN>)) {
193 ## find the root element
194 if ($line =~ /<([\w\d:]+)[\s>]/){
195 my $root = $1;
196 if ($root !~ $doctype){
197 close(XMLIN);
198 return 0;
199 }
200 else {
201 close(XMLIN);
202 return 1;
203 }
204 }
205 }
206 close(XMLIN);
207 }
208
209 return undef; # haven't found a valid line
210
211}
212
213sub file_block_read {
214 my $self = shift (@_);
215 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
216
217 my $filename_full_path = &util::filename_cat($base_dir, $file);
218 return undef unless $self->can_process_this_file($filename_full_path);
219
220 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
221
222 my $lower_drive = $filename_full_path;
223 $lower_drive =~ s/^([A-Z]):/\l$1:/i;
224
225 my $upper_drive = $filename_full_path;
226 $upper_drive =~ s/^([A-Z]):/\u$1:/i;
227
228 $block_hash->{'metadata_files'}->{$lower_drive} = 1;
229 $block_hash->{'metadata_files'}->{$upper_drive} = 1;
230 }
231 else {
232 $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
233 }
234
235 return 1;
236}
237
238sub metadata_read
239{
240 my $self = shift (@_);
241 my ($pluginfo, $base_dir, $file, $block_hash,
242 $extrametakeys, $extrametadata,$extrametafile,
243 $processor, $gli, $aux) = @_;
244
245 my $filename = &util::filename_cat($base_dir, $file);
246 return undef unless $self->can_process_this_file($filename);
247
248 $self->{'metadata-file'} = $file;
249 $self->{'metadata-filename'} = $filename;
250
251 my $outhandle = $self->{'outhandle'};
252
253 print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
254 print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
255 # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
256 $self->block_filename($block_hash,$filename);
257
258 $self->{'metadataref'} = $extrametadata;
259 $self->{'metafileref'} = $extrametafile;
260 $self->{'metakeysref'} = $extrametakeys;
261
262 eval {
263 $self->{'parser'}->parsefile($filename);
264 };
265
266 if ($@) {
267 my $plugin_name = ref ($self);
268 my $failhandle = $self->{'failhandle'};
269 print $outhandle "$plugin_name failed to process $file ($@)\n";
270 print $failhandle "$plugin_name failed to process $file ($@)\n";
271 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
272 return -1; #error
273 }
274
275 return 1;
276
277}
278
279
280# Updated by Jeffrey 2010/04/16 @ DL Consulting Ltd.
281# Get rid off the global $self as it cause problems when there are 2+ MetadataXMLPlugin in your collect.cfg...
282# For example when you have an OAIMetadataXMLPlugin that is a child of MetadataXMLPlugin
283sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
284sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
285sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
286sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
287
288
289sub xml_doctype {
290 my $self = shift(@_);
291 my ($expat, $name, $sysid, $pubid, $internal) = @_;
292
293 # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
294 # to be processed as well as the "DirectoryMetadata" files which should now
295 # be created by import.pl
296 die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
297}
298
299sub xml_start_tag {
300 my $self = shift(@_);
301 my ($expat, $element) = @_;
302
303 if ($element eq "FileSet") {
304 $self->{'saved_targets'} = [];
305 $self->{'saved_metadata'} = {};
306 }
307 elsif ($element eq "FileName") {
308 $self->{'in_filename'} = 1;
309 }
310 elsif ($element eq "Metadata") {
311 $self->{'metadata_name'} = $_{'name'};
312 $self->{'metadata_value'} = "";
313 if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
314 $self->{'metadata_accumulate'} = 1;
315 } else {
316 $self->{'metadata_accumulate'} = 0;
317 }
318 }
319}
320
321sub xml_end_tag {
322 my $self = shift(@_);
323 my ($expat, $element) = @_;
324
325 if ($element eq "FileSet") {
326 foreach my $target (@{$self->{'saved_targets'}}) {
327 my $file_metadata = $self->{'metadataref'}->{$target};
328 my $saved_metadata = $self->{'saved_metadata'};
329
330 if (!defined $file_metadata) {
331 $self->{'metadataref'}->{$target} = $saved_metadata;
332
333 # not had target before
334 push (@{$self->{'metakeysref'}}, $target);
335 }
336 else {
337 &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
338 }
339
340
341 # now record which metadata.xml file it came from
342
343 my $file = $self->{'metadata-file'};
344 my $filename = $self->{'metadata-filename'};
345
346 if (!defined $self->{'metafileref'}->{$target}) {
347 $self->{'metafileref'}->{$target} = {};
348 }
349
350 $self->{'metafileref'}->{$target}->{$file} = $filename
351 }
352 }
353 elsif ($element eq "FileName") {
354 $self->{'in_filename'} = 0;
355 }
356 elsif ($element eq "Metadata") {
357 my $metadata_name = decode("utf8",$self->{'metadata_name'});
358 my $metadata_value = decode("utf8",$self->{'metadata_value'});
359
360 &metadatautil::store_saved_metadata($self,
361 $metadata_name, $metadata_value,
362 $self->{'metadata_accumulate'});
363 $self->{'metadata_name'} = "";
364 }
365
366}
367
368sub xml_text {
369 my $self = shift(@_);
370
371 if ($self->{'in_filename'}) {
372 # $_ == FileName content
373 push (@{$self->{'saved_targets'}}, $_);
374 }
375 elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
376 # $_ == Metadata content
377 $self->{'metadata_value'} = $_;
378 }
379}
380
381# This Char function overrides the one in XML::Parser::Stream to overcome a
382# problem where $expat->{Text} is treated as the return value, slowing
383# things down significantly in some cases.
384sub Char {
385 if ($]<5.008) {
386 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
387 }
388 $_[0]->{'Text'} .= $_[1];
389 return undef;
390}
391
392
393
3941;
Note: See TracBrowser for help on using the repository browser.