root/main/trunk/greenstone2/perllib/plugins/MetadataXMLPlugin.pm @ 24060

Revision 24060, 12.2 KB (checked in by ak19, 9 years ago)

Dr Bainbridge fixed the unicode issue that appeared after the 2.84 release, when the parser instantiation got changed (by removing its ProtocolEncoding? argument, which used to set this encoding to Latin-1).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# MetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# MetadataXMLPlugin process metadata.xml files in a collection
27
28# Here's an example of a metadata file that uses three FileSet structures
29# (ignore the # characters):
30
31#<?xml version="1.0" encoding="UTF-8" standalone="no"?>
32#<!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd">
33#<DirectoryMetadata>
34#  <FileSet>
35#    <FileName>nugget.*</FileName>
36#    <Description>
37#      <Metadata name="Title">Nugget Point, The Catlins</Metadata>
38#      <Metadata name="Place" mode="accumulate">Nugget Point</Metadata>
39#    </Description>
40#  </FileSet>
41#  <FileSet>
42#    <FileName>nugget-point-1.jpg</FileName>
43#    <Description>
44#      <Metadata name="Title">Nugget Point Lighthouse, The Catlins</Metadata>
45#      <Metadata name="Subject">Lighthouse</Metadata>
46#    </Description>
47#  </FileSet>
48#  <FileSet>
49#    <FileName>kaka-point-dir</FileName>
50#    <Description>
51#      <Metadata name="Title">Kaka Point, The Catlins</Metadata>
52#    </Description>
53#  </FileSet>
54#</DirectoryMetadata>
55
56# Metadata elements are read and applied to files in the order they appear
57# in the file.
58#
59# The FileName element describes the subfiles in the directory that the
60# metadata applies to as a perl regular expression (a FileSet group may
61# contain multiple FileName elements). So, <FileName>nugget.*</FileName>
62# indicates that the metadata records in the following Description block
63# apply to every subfile that starts with "nugget".  For these files, a
64# Title metadata element is set, overriding any old value that the Title
65# might have had.
66#
67# Occasionally, we want to have multiple metadata values applied to a
68# document; in this case we use the "mode=accumulate" attribute of the
69# particular Metadata element.  In the second metadata element of the first
70# FileSet above, the "Place" metadata is accumulating, and may therefore be
71# given several values.  If we wanted to override these values and use a
72# single metadata element again, we could set the mode attribute to
73# "override" instead.  Remember: every element is assumed to be in override
74# mode unless you specify otherwise, so if you want to accumulate metadata
75# for some field, every occurance must have "mode=accumulate" specified.
76#
77# The second FileSet element above applies to a specific file, called
78# nugget-point-1.jpg.  This element overrides the Title metadata set in the
79# first FileSet, and adds a "Subject" metadata field.
80#
81# The third and final FileSet sets metadata for a subdirectory rather than
82# a file.  The metadata specified (a Title) will be passed into the
83# subdirectory and applied to every file that occurs in the subdirectory
84# (and to every subsubdirectory and its contents, and so on) unless the
85# metadata is explictly overridden later in the import.
86
87package MetadataXMLPlugin;
88
89use strict;
90no strict 'refs';
91
92use Encode;
93
94use BasePlugin;
95use util;
96use metadatautil;
97
98sub BEGIN {
99    @MetadataXMLPlugin::ISA = ('BasePlugin');
100    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
101}
102
103use XMLParser;
104
105my $arguments = [
106      { 'name' => "process_exp",
107    'desc' => "{BasePlugin.process_exp}",
108    'type' => "regexp",
109    'reqd' => "no",
110    'deft' => &get_default_process_exp() }
111
112];
113
114my $options = { 'name'     => "MetadataXMLPlugin",
115        'desc'     => "{MetadataXMLPlugin.desc}",
116        'abstract' => "no",
117        'inherits' => "yes",
118        'args'     => $arguments };
119
120sub new {
121    my ($class) = shift (@_);
122    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
123    push(@$pluginlist, $class);
124
125    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
126    push(@{$hashArgOptLists->{"OptList"}},$options);
127
128    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
129
130    if ($self->{'info_only'}) {
131    # don't worry about any options or initialisations etc
132    return bless $self, $class;
133    }
134   
135    # The following used to be passed in as a parameter to XML::Parser,
136    # if the version of perl was greater than or equal to 5.8.
137    # The svn commit comment explaining the reason for adding this was
138    # not very clear and also said that it was quick fix and hadn't
139    # been tested under windows.
140    # More recent work has been to make strings in Perl "Unicode-aware"
141    # and so this line might actually be potentially harmful, however
142    # it is not the case that we encountered an actual error leading to
143    # its removal, rather it has been eliminated in an attempt to tighten
144    # up the code. For example, this protocol encoding is not used in
145    # ReadXMLFile.
146    # 'ProtocolEncoding' => 'ISO-8859-1',
147
148    # create XML::Parser object for parsing metadata.xml files
149    my $parser = new XML::Parser('Style' => 'Stream',                             
150                                  'Pkg' => 'MetadataXMLPlugin',
151                                  'PluginObj' => $self,
152                    'Handlers' => {'Char' => \&Char,
153                         'Doctype' => \&Doctype
154                         });
155
156    $self->{'parser'} = $parser;
157    $self->{'in_filename'} = 0;
158   
159   
160    return bless $self, $class;
161}
162
163
164sub get_default_process_exp
165{
166    return q^metadata\.xml$^;
167}
168
169sub get_doctype {
170    my $self = shift(@_);
171   
172    return "(Greenstone)?DirectoryMetadata"
173}
174
175sub can_process_this_file {
176    my $self = shift(@_);
177    my ($filename) = @_;
178
179    if (-f $filename && $self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
180    return 1; # its a file for us
181    }
182    return 0;
183}
184
185sub check_doctype {
186    my $self = shift (@_);
187   
188    my ($filename) = @_;
189
190    if (open(XMLIN,"<$filename")) {
191    my $doctype = $self->get_doctype();
192    ## check whether the doctype has the same name as the root element tag
193    while (defined (my $line = <XMLIN>)) {
194        ## find the root element
195        if ($line =~ /<([\w\d:]+)[\s>]/){
196        my $root = $1;
197        if ($root !~ $doctype){
198            close(XMLIN);
199            return 0;
200        }
201        else {
202            close(XMLIN);
203            return 1;
204        }
205        }
206    }
207    close(XMLIN);
208    }
209   
210    return undef; # haven't found a valid line
211   
212}
213
214sub file_block_read {
215    my $self = shift (@_);
216    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;
217   
218    my $filename_full_path = &util::filename_cat($base_dir, $file);
219    return undef unless $self->can_process_this_file($filename_full_path);   
220
221    if ($ENV{'GSDLOS'} =~ m/^windows$/) {
222       
223        my $lower_drive = $filename_full_path;
224        $lower_drive =~ s/^([A-Z]):/\l$1:/i;
225       
226        my $upper_drive = $filename_full_path;
227        $upper_drive =~ s/^([A-Z]):/\u$1:/i;
228       
229        $block_hash->{'metadata_files'}->{$lower_drive} = 1;
230        $block_hash->{'metadata_files'}->{$upper_drive} = 1;       
231    }
232    else {
233        $block_hash->{'metadata_files'}->{$filename_full_path} = 1;
234    }
235
236    return 1;
237}
238
239sub metadata_read
240{
241    my $self = shift (@_);
242    my ($pluginfo, $base_dir, $file, $block_hash,
243    $extrametakeys, $extrametadata,$extrametafile,
244    $processor, $gli, $aux) = @_;
245
246    my $filename = &util::filename_cat($base_dir, $file);
247    return undef unless $self->can_process_this_file($filename);   
248   
249    $self->{'metadata-file'} = $file;
250    $self->{'metadata-filename'} = $filename;
251   
252    my $outhandle = $self->{'outhandle'};
253   
254    print STDERR "\n<Processing n='$file' p='MetadataXMLPlugin'>\n" if ($gli);
255    print $outhandle "MetadataXMLPlugin: processing $file\n" if ($self->{'verbosity'})> 1;
256    # add the file to the block list so that it won't be processed in read, as we will do all we can with it here
257    &util::block_filename($block_hash,$filename);
258
259    $self->{'metadataref'} = $extrametadata;
260    $self->{'metafileref'} = $extrametafile;
261    $self->{'metakeysref'} = $extrametakeys;
262   
263    eval {
264    $self->{'parser'}->parsefile($filename);
265    };
266
267    if ($@) {
268    print STDERR "**** Error is: $@\n";
269    my $plugin_name = ref ($self);
270    my $failhandle = $self->{'failhandle'};
271    print $outhandle "$plugin_name failed to process $file ($@)\n";
272    print $failhandle "$plugin_name failed to process $file ($@)\n";
273    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
274    return -1; #error
275    }
276
277    return 1;
278
279}
280
281
282# Updated by Jeffrey 2010/04/16 @ DL Consulting Ltd.
283# Get rid off the global $self as it cause problems when there are 2+ MetadataXMLPlugin in your collect.cfg...
284# For example when you have an OAIMetadataXMLPlugin that is a child of MetadataXMLPlugin
285sub Doctype {$_[0]->{'PluginObj'}->xml_doctype(@_);}
286sub StartTag {$_[0]->{'PluginObj'}->xml_start_tag(@_);}
287sub EndTag {$_[0]->{'PluginObj'}->xml_end_tag(@_);}
288sub Text {$_[0]->{'PluginObj'}->xml_text(@_);}
289
290
291sub xml_doctype {
292    my $self = shift(@_);
293    my ($expat, $name, $sysid, $pubid, $internal) = @_;
294
295    # allow the short-lived and badly named "GreenstoneDirectoryMetadata" files
296    # to be processed as well as the "DirectoryMetadata" files which should now
297    # be created by import.pl
298    die if ($name !~ /^(Greenstone)?DirectoryMetadata$/);
299}
300
301sub xml_start_tag {
302    my $self = shift(@_);
303    my ($expat, $element) = @_;
304   
305    if ($element eq "FileSet") {
306    $self->{'saved_targets'} = [];
307    $self->{'saved_metadata'} = {};
308    }
309    elsif ($element eq "FileName") {
310    $self->{'in_filename'} = 1;
311    }
312    elsif ($element eq "Metadata") {
313    $self->{'metadata_name'} = $_{'name'};
314    $self->{'metadata_value'} = "";
315    if ((defined $_{'mode'}) && ($_{'mode'} eq "accumulate")) {
316        $self->{'metadata_accumulate'} = 1;
317    } else {
318        $self->{'metadata_accumulate'} = 0;
319    }
320    }
321}
322
323sub xml_end_tag {
324    my $self = shift(@_);
325    my ($expat, $element) = @_;
326
327    if ($element eq "FileSet") {
328    foreach my $target (@{$self->{'saved_targets'}}) {
329        my $file_metadata = $self->{'metadataref'}->{$target};
330        my $saved_metadata = $self->{'saved_metadata'};
331
332        if (!defined $file_metadata) {
333        $self->{'metadataref'}->{$target} = $saved_metadata;
334
335        # not had target before
336        push (@{$self->{'metakeysref'}}, $target);
337        }
338        else {
339        &metadatautil::combine_metadata_structures($file_metadata,$saved_metadata);
340        }
341
342       
343        # now record which metadata.xml file it came from
344
345        my $file = $self->{'metadata-file'};
346        my $filename = $self->{'metadata-filename'};
347
348        if (!defined $self->{'metafileref'}->{$target}) {
349            $self->{'metafileref'}->{$target} = {};
350        }
351
352        $self->{'metafileref'}->{$target}->{$file} = $filename
353    }
354    }
355    elsif ($element eq "FileName") {
356    $self->{'in_filename'} = 0;
357    }
358    elsif ($element eq "Metadata") {
359    # text read in by XML::Parser is in Perl's binary byte value
360    # form ... need to explicitly make it UTF-8
361   
362    my $metadata_name = decode("utf-8",$self->{'metadata_name'});
363    my $metadata_value = decode("utf-8",$self->{'metadata_value'});
364   
365    &metadatautil::store_saved_metadata($self,
366                        $metadata_name, $metadata_value,
367                        $self->{'metadata_accumulate'});
368    $self->{'metadata_name'} = "";
369    }
370
371}
372
373sub xml_text {
374    my $self = shift(@_);
375
376    if ($self->{'in_filename'}) {
377    # $_ == FileName content
378    push (@{$self->{'saved_targets'}}, $_);
379    }
380    elsif (defined ($self->{'metadata_name'}) && $self->{'metadata_name'} ne "") {
381    # $_ == Metadata content
382    $self->{'metadata_value'} = $_;
383    }
384}
385
386# This Char function overrides the one in XML::Parser::Stream to overcome a
387# problem where $expat->{Text} is treated as the return value, slowing
388# things down significantly in some cases.
389sub Char {
390    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
391
392#    if ($]<5.008) {
393#   use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+ and Perl 5.6
394#    }
395    $_[0]->{'Text'} .= $_[1];
396    return undef;
397}
398
399
400
4011;
Note: See TracBrowser for help on using the browser.